Duo Zhang created HBASE-30118:
---------------------------------
Summary: Deadlock in metrics system causes UTs to hang
Key: HBASE-30118
URL: https://issues.apache.org/jira/browse/HBASE-30118
Project: HBase
Issue Type: Bug
Components: hadoop2, hadoop3, metrics, test
Reporter: Duo Zhang
{noformat}
"RS_OPEN_META-regionserver/2cd189b8f196:0-0" daemon prio=5 tid=470 blocked
java.lang.Thread.State: BLOCKED
at
app//org.apache.hadoop.metrics2.impl.MetricsSystemImpl.register(MetricsSystemImpl.java:223)
at
app//org.apache.hadoop.hbase.metrics.BaseSourceImpl.<init>(BaseSourceImpl.java:115)
at
app//org.apache.hadoop.hbase.io.MetricsIOSourceImpl.<init>(MetricsIOSourceImpl.java:44)
at
app//org.apache.hadoop.hbase.io.MetricsIOSourceImpl.<init>(MetricsIOSourceImpl.java:39)
at
app//org.apache.hadoop.hbase.regionserver.MetricsRegionServerSourceFactoryImpl.createIO(MetricsRegionServerSourceFactoryImpl.java:99)
at
app//org.apache.hadoop.hbase.io.MetricsIO.<init>(MetricsIO.java:36)
at
app//org.apache.hadoop.hbase.io.MetricsIO.getInstance(MetricsIO.java:52)
at
app//org.apache.hadoop.hbase.io.hfile.HFile.updateWriteLatency(HFile.java:205)
at
app//org.apache.hadoop.hbase.io.hfile.HFileBlock$Writer.finishBlockAndWriteHeaderAndData(HFileBlock.java:1051)
at
app//org.apache.hadoop.hbase.io.hfile.HFileBlock$Writer.writeHeaderAndData(HFileBlock.java:1036)
at
app//org.apache.hadoop.hbase.io.hfile.HFileWriterImpl.finishBlock(HFileWriterImpl.java:384)
at
app//org.apache.hadoop.hbase.io.hfile.HFileWriterImpl.close(HFileWriterImpl.java:653)
at
app//org.apache.hadoop.hbase.regionserver.StoreFileWriter$SingleStoreFileWriter.close(StoreFileWriter.java:781)
at
app//org.apache.hadoop.hbase.regionserver.StoreFileWriter.close(StoreFileWriter.java:301)
at
app//org.apache.hadoop.hbase.regionserver.StoreFlusher.finalizeWriter(StoreFlusher.java:70)
at
app//org.apache.hadoop.hbase.regionserver.DefaultStoreFlusher.flushSnapshot(DefaultStoreFlusher.java:74)
at
app//org.apache.hadoop.hbase.regionserver.HStore.flushCache(HStore.java:836)
at
app//org.apache.hadoop.hbase.regionserver.HStore$StoreFlusherImpl.flushCache(HStore.java:1987)
at
app//org.apache.hadoop.hbase.regionserver.HRegion.internalFlushCacheAndCommit(HRegion.java:3158)
at
app//org.apache.hadoop.hbase.regionserver.HRegion.internalFlushcache(HRegion.java:2866)
at
app//org.apache.hadoop.hbase.regionserver.HRegion.replayRecoveredEditsIfAny(HRegion.java:5623)
at
app//org.apache.hadoop.hbase.regionserver.HRegion.initializeRegionInternals(HRegion.java:1099)
at
app//org.apache.hadoop.hbase.regionserver.HRegion.initialize(HRegion.java:1033)
at
app//org.apache.hadoop.hbase.regionserver.HRegion.openHRegion(HRegion.java:8038)
at
app//org.apache.hadoop.hbase.regionserver.HRegion.openHRegionFromTableDir(HRegion.java:7992)
at
app//org.apache.hadoop.hbase.regionserver.HRegion.openHRegion(HRegion.java:7964)
at
app//org.apache.hadoop.hbase.regionserver.HRegion.openHRegion(HRegion.java:7912)
at
app//org.apache.hadoop.hbase.regionserver.HRegion.openHRegion(HRegion.java:7843)
at
app//org.apache.hadoop.hbase.regionserver.handler.AssignRegionHandler.process(AssignRegionHandler.java:143)
at
app//org.apache.hadoop.hbase.executor.EventHandler.run(EventHandler.java:104)
at
[email protected]/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at
[email protected]/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at [email protected]/java.lang.Thread.run(Thread.java:840)
{noformat}
{noformat}
"HBase-Metrics2-1" daemon prio=5 tid=199 in Object.wait()
java.lang.Thread.State: WAITING (on object monitor)
at [email protected]/jdk.internal.misc.Unsafe.park(Native Method)
at
[email protected]/java.util.concurrent.locks.LockSupport.park(LockSupport.java:211)
at
[email protected]/java.util.concurrent.CompletableFuture$Signaller.block(CompletableFuture.java:1864)
at
[email protected]/java.util.concurrent.ForkJoinPool.unmanagedBlock(ForkJoinPool.java:3465)
at
[email protected]/java.util.concurrent.ForkJoinPool.managedBlock(ForkJoinPool.java:3436)
at
[email protected]/java.util.concurrent.CompletableFuture.waitingGet(CompletableFuture.java:1898)
at
[email protected]/java.util.concurrent.CompletableFuture.get(CompletableFuture.java:2072)
at
app//org.apache.hadoop.hbase.util.FutureUtils.get(FutureUtils.java:182)
at
app//org.apache.hadoop.hbase.client.TableOverAsyncTable.get(TableOverAsyncTable.java:188)
at
app//org.apache.hadoop.hbase.MetaTableAccessor.getTableState(MetaTableAccessor.java:601)
at
app//org.apache.hadoop.hbase.master.TableStateManager.readMetaState(TableStateManager.java:177)
at
app//org.apache.hadoop.hbase.master.TableStateManager.isTablePresent(TableStateManager.java:107)
at
app//org.apache.hadoop.hbase.master.HMaster.getTableDescriptors(HMaster.java:3856)
at
app//org.apache.hadoop.hbase.master.HMaster.listTableDescriptors(HMaster.java:3806)
at
app//org.apache.hadoop.hbase.master.MetricsMasterWrapperImpl.getRegionCounts(MetricsMasterWrapperImpl.java:227)
at
app//org.apache.hadoop.hbase.master.MetricsMasterSourceImpl.getMetrics(MetricsMasterSourceImpl.java:95)
at
app//org.apache.hadoop.metrics2.impl.MetricsSourceAdapter.getMetrics(MetricsSourceAdapter.java:200)
at
app//org.apache.hadoop.metrics2.impl.MetricsSourceAdapter.updateJmxCache(MetricsSourceAdapter.java:183)
at
app//org.apache.hadoop.metrics2.impl.MetricsSourceAdapter.getMBeanInfo(MetricsSourceAdapter.java:156)
at
[email protected]/com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.getClassName(DefaultMBeanServerInterceptor.java:1766)
at
[email protected]/com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.safeGetClassName(DefaultMBeanServerInterceptor.java:1575)
at
[email protected]/com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.checkMBeanPermission(DefaultMBeanServerInterceptor.java:1776)
at
[email protected]/com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.exclusiveUnregisterMBean(DefaultMBeanServerInterceptor.java:426)
at
[email protected]/com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.unregisterMBean(DefaultMBeanServerInterceptor.java:411)
at
[email protected]/com.sun.jmx.mbeanserver.JmxMBeanServer.unregisterMBean(JmxMBeanServer.java:547)
at
app//org.apache.hadoop.metrics2.util.MBeans.unregister(MBeans.java:144)
at
app//org.apache.hadoop.metrics2.impl.MetricsSourceAdapter.stopMBeans(MetricsSourceAdapter.java:228)
at
app//org.apache.hadoop.metrics2.impl.MetricsSourceAdapter.stop(MetricsSourceAdapter.java:213)
at
app//org.apache.hadoop.metrics2.impl.MetricsSystemImpl.stopSources(MetricsSystemImpl.java:464)
at
app//org.apache.hadoop.metrics2.impl.MetricsSystemImpl.stop(MetricsSystemImpl.java:212)
at
app//org.apache.hadoop.metrics2.impl.JmxCacheBuster$JmxCacheBusterRunnable.run(JmxCacheBuster.java:98)
at
[email protected]/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:539)
at
[email protected]/java.util.concurrent.FutureTask.run(FutureTask.java:264)
at
[email protected]/java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:304)
at
[email protected]/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at
[email protected]/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at [email protected]/java.lang.Thread.run(Thread.java:840)
{noformat}
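In a UT, we kill the regionserver hosting meta, so meta gets assigned to a new
regionserver. While opening meta, the RS_OPEN_META thread tries to update
metrics and blocks on the metrics system lock when registering a new metrics
source. At the same time, JmxCacheBuster is trying to recreate all the JMX
metrics, which ends up accessing the meta region while holding the metrics
lock, and it is blocked because meta is not online. Meta cannot come online
until the RS_OPEN_META thread gets the lock, so neither thread can make
progress and the UT hangs.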
--
This message was sent by Atlassian Jira
(v8.20.10#820010)