[ https://issues.apache.org/jira/browse/HDFS-16855?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17640039#comment-17640039 ]
ASF GitHub Bot commented on HDFS-16855: --------------------------------------- dingshun3016 opened a new pull request, #5170: URL: https://github.com/apache/hadoop/pull/5170 When patching the datanode's fine-grained lock, we found that the datanode couldn't start,maybe happened deadlock,when addBlockPool, so we can remove it. org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl#addBlockPool get writeLock org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl#deepCopyReplica need readLock because it is not the same thread, so the write lock cannot be downgraded to a read lock > Remove the redundant write lock in addBlockPool > ----------------------------------------------- > > Key: HDFS-16855 > URL: https://issues.apache.org/jira/browse/HDFS-16855 > Project: Hadoop HDFS > Issue Type: Bug > Reporter: dingshun > Priority: Major > > When patching the datanode's fine-grained lock, we found that the datanode > couldn't start,maybe happened deadlock,when addBlockPool, so we can remove it. > {code:java} > // getspaceused classname > <property> > <name>fs.getspaceused.classname</name> > > <value>org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.ReplicaCachingGetSpaceUsed</value> > </property> {code} > {code:java} > // > org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl#addBlockPool > > // get writeLock > @Override > public void addBlockPool(String bpid, Configuration conf) > throws IOException { > LOG.info("Adding block pool " + bpid); > AddBlockPoolException volumeExceptions = new AddBlockPoolException(); > try (AutoCloseableLock lock = lockManager.writeLock(LockLevel.BLOCK_POOl, > bpid)) { > try { > volumes.addBlockPool(bpid, conf); > } catch (AddBlockPoolException e) { > volumeExceptions.mergeException(e); > } > volumeMap.initBlockPool(bpid); > Set<String> vols = storageMap.keySet(); > for (String v : vols) { > lockManager.addLock(LockLevel.VOLUME, bpid, v); > } > } > > } {code} > {code:java} > // > org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl#deepCopyReplica > // need readLock > void replicas(String bpid, Consumer<Iterator<ReplicaInfo>> consumer) { > LightWeightResizableGSet<Block, ReplicaInfo> m = null; > try (AutoCloseDataSetLock l = lockManager.readLock(LockLevel.BLOCK_POOl, > bpid)) { > m = map.get(bpid); > if (m !=null) { > m.getIterator(consumer); > } > } > } {code} > > because it is not the same thread, so the write lock cannot be downgraded to > a read lock > {code:java} > void addBlockPool(final String bpid, final Configuration conf) throws > IOException { > long totalStartTime = Time.monotonicNow(); > final Map<FsVolumeSpi, IOException> unhealthyDataDirs = > new ConcurrentHashMap<FsVolumeSpi, IOException>(); > List<Thread> blockPoolAddingThreads = new ArrayList<Thread>(); > for (final FsVolumeImpl v : volumes) { > Thread t = new Thread() { > public void run() { > try (FsVolumeReference ref = v.obtainReference()) { > FsDatasetImpl.LOG.info("Scanning block pool " + bpid + > " on volume " + v + "..."); > long startTime = Time.monotonicNow(); > v.addBlockPool(bpid, conf); > long timeTaken = Time.monotonicNow() - startTime; > FsDatasetImpl.LOG.info("Time taken to scan block pool " + bpid + > " on " + v + ": " + timeTaken + "ms"); > } catch (IOException ioe) { > FsDatasetImpl.LOG.info("Caught exception while scanning " + v + > ". Will throw later.", ioe); > unhealthyDataDirs.put(v, ioe); > } > } > }; > blockPoolAddingThreads.add(t); > t.start(); > } > for (Thread t : blockPoolAddingThreads) { > try { > t.join(); > } catch (InterruptedException ie) { > throw new IOException(ie); > } > } > } {code} > -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: hdfs-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: hdfs-issues-h...@hadoop.apache.org