KevinWikant commented on a change in pull request #3675:
URL: https://github.com/apache/hadoop/pull/3675#discussion_r762076676
##########
File path: hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
##########
@@ -1654,4 +1658,139 @@ public Boolean get() {
     cleanupFile(fileSys, file);
   }
+
+  /**
+   * Test DatanodeAdminManager logic to re-queue unhealthy decommissioning nodes
+   * which are blocking the decommissioning of healthy nodes.
+   * Force the tracked nodes set to be filled with nodes lost while decommissioning,
+   * then decommission healthy nodes & validate they are decommissioned eventually.
+   */
+  @Test(timeout = 120000)
+  public void testRequeueUnhealthyDecommissioningNodes() throws Exception {
+    // Allow 3 datanodes to be decommissioned at a time
+    getConf().setInt(DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_MAX_CONCURRENT_TRACKED_NODES, 3);
+    // Disable the normal monitor runs
+    getConf()
+        .setInt(MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, Integer.MAX_VALUE);
+
+    // Start cluster with 6 datanodes
+    startCluster(1, 6);
+    final FSNamesystem namesystem = getCluster().getNamesystem();
+    final BlockManager blockManager = namesystem.getBlockManager();
+    final DatanodeManager datanodeManager = blockManager.getDatanodeManager();
+    final DatanodeAdminManager decomManager = datanodeManager.getDatanodeAdminManager();
+    assertEquals(6, getCluster().getDataNodes().size());
+
+    // 3 datanodes will be "live" datanodes that are expected to be decommissioned eventually
+    final List<DatanodeDescriptor> liveNodes = getCluster().getDataNodes().subList(3, 6).stream()
+        .map(dn -> getDatanodeDesriptor(namesystem, dn.getDatanodeUuid()))
+        .collect(Collectors.toList());
+    assertEquals(3, liveNodes.size());
+
+    // 3 datanodes will be "dead" datanodes that are expected to never be decommissioned
+    final List<DatanodeDescriptor> deadNodes = getCluster().getDataNodes().subList(0, 3).stream()
+        .map(dn -> getDatanodeDesriptor(namesystem, dn.getDatanodeUuid()))
+        .collect(Collectors.toList());
+    assertEquals(3, deadNodes.size());
+
+    // Need to create some data or "isNodeHealthyForDecommissionOrMaintenance"
+    // may unexpectedly return true for a dead node
+    writeFile(getCluster().getFileSystem(), new Path("/tmp/test1"), 1, 100);

Review comment:
   Should use a larger replication factor here to ensure there are LowRedundancy blocks.
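   A minimal sketch of that suggestion, reusing the test's existing writeFile
   helper (the factor of 3 is an assumption, not the final change; any value
   above 1 that places replicas on the soon-to-be-stopped datanodes would do):

       // Hypothetical tweak: write with replication factor 3 instead of 1, so
       // that stopping the 3 "dead" datanodes leaves some blocks
       // under-replicated (LowRedundancy) and
       // isNodeHealthyForDecommissionOrMaintenance can no longer report a
       // dead node as healthy.
       writeFile(getCluster().getFileSystem(), new Path("/tmp/test1"), 3, 100);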