Author: kihwal
Date: Mon Mar 24 15:39:00 2014
New Revision: 1580886

URL: http://svn.apache.org/r1580886
Log:
HDFS-3087. Decomissioning on NN restart can complete without blocks being
replicated. Contributed by Rushabh S Shah.
Modified:
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1580886&r1=1580885&r2=1580886&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Mon Mar 24 15:39:00 2014
@@ -265,6 +265,9 @@ Release 2.5.0 - UNRELEASED
     HDFS-6112. NFS Gateway docs are incorrect for allowed hosts configuration.
     (atm)
 
+    HDFS-3087. Decomissioning on NN restart can complete without blocks being
+    replicated. (Rushabh S Shah via kihwal)
+
 Release 2.4.0 - UNRELEASED
 
   INCOMPATIBLE CHANGES

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java?rev=1580886&r1=1580885&r2=1580886&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java Mon Mar 24 15:39:00 2014
@@ -695,5 +695,20 @@ public class DatanodeDescriptor extends
   public void setLastCachingDirectiveSentTimeMs(long time) {
     this.lastCachingDirectiveSentTimeMs = time;
   }
+
+  /**
+   * checks whether atleast first block report has been received
+   * @return
+   */
+  public boolean checkBlockReportReceived() {
+    if(this.getStorageInfos().length == 0) {
+      return false;
+    }
+    for(DatanodeStorageInfo storageInfo: this.getStorageInfos()) {
+      if(storageInfo.getBlockReportCount() == 0 )
+        return false;
+    }
+    return true;
+  }
 
 }
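The new DatanodeDescriptor#checkBlockReportReceived above is the core guard: a
datanode that has just re-registered after a NameNode restart either has no
storages yet, or has storages whose block report count is still zero, so the
NameNode knows of none of its blocks. A minimal standalone sketch of the
guard's behavior follows, assuming simplified stand-in types (Storage and
blockReportReceived are illustrative, not the real DatanodeStorageInfo /
DatanodeDescriptor API):

    // Minimal sketch of the guard's behavior; Storage and blockReportReceived
    // are hypothetical stand-ins, not the real Hadoop classes.
    public class BlockReportGuardSketch {

      // Stand-in for DatanodeStorageInfo: counts block reports received.
      static class Storage {
        final int blockReportCount;
        Storage(int blockReportCount) { this.blockReportCount = blockReportCount; }
      }

      // Mirrors the logic of checkBlockReportReceived: true only once every
      // storage on the node has sent at least one block report.
      static boolean blockReportReceived(Storage[] storages) {
        if (storages.length == 0) {
          return false; // freshly re-registered node: no storages known yet
        }
        for (Storage s : storages) {
          if (s.blockReportCount == 0) {
            return false; // at least one storage has not reported
          }
        }
        return true;
      }

      public static void main(String[] args) {
        System.out.println(blockReportReceived(new Storage[0]));  // false
        System.out.println(blockReportReceived(
            new Storage[] { new Storage(1), new Storage(0) }));   // false
        System.out.println(blockReportReceived(
            new Storage[] { new Storage(1), new Storage(2) }));   // true
      }
    }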
Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java?rev=1580886&r1=1580885&r2=1580886&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java Mon Mar 24 15:39:00 2014
@@ -711,7 +711,7 @@ public class DatanodeManager {
   boolean checkDecommissionState(DatanodeDescriptor node) {
     // Check to see if all blocks in this decommissioned
     // node has reached their target replication factor.
-    if (node.isDecommissionInProgress()) {
+    if (node.isDecommissionInProgress() && node.checkBlockReportReceived()) {
       if (!blockManager.isReplicationInProgress(node)) {
         node.setDecommissioned();
         LOG.info("Decommission complete for " + node);

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java?rev=1580886&r1=1580885&r2=1580886&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java Mon Mar 24 15:39:00 2014
@@ -23,6 +23,7 @@ import static org.junit.Assert.assertTru
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -829,4 +830,63 @@ public class TestDecommission {
     fdos.close();
   }
+
+  /**
+   * Tests restart of namenode while datanode hosts are added to exclude file
+   **/
+  @Test(timeout=360000)
+  public void testDecommissionWithNamenodeRestart()throws IOException, InterruptedException {
+    LOG.info("Starting test testDecommissionWithNamenodeRestart");
+    int numNamenodes = 1;
+    int numDatanodes = 1;
+    int replicas = 1;
+
+    startCluster(numNamenodes, numDatanodes, conf);
+    Path file1 = new Path("testDecommission.dat");
+    FileSystem fileSys = cluster.getFileSystem();
+    writeFile(fileSys, file1, replicas);
+
+    DFSClient client = getDfsClient(cluster.getNameNode(), conf);
+    DatanodeInfo[] info = client.datanodeReport(DatanodeReportType.LIVE);
+    DatanodeID excludedDatanodeID = info[0];
+    String excludedDatanodeName = info[0].getXferAddr();
+
+    writeConfigFile(excludeFile, new ArrayList<String>(Arrays.asList(excludedDatanodeName)));
+
+    //Add a new datanode to cluster
+    cluster.startDataNodes(conf, 1, true, null, null, null, null);
+    numDatanodes+=1;
+
+    assertEquals("Number of datanodes should be 2 ", 2, cluster.getDataNodes().size());
+    //Restart the namenode
+    cluster.restartNameNode();
+    DatanodeInfo datanodeInfo = NameNodeAdapter.getDatanode(
+        cluster.getNamesystem(), excludedDatanodeID);
+    waitNodeState(datanodeInfo, AdminStates.DECOMMISSIONED);
+
+    // Ensure decommissioned datanode is not automatically shutdown
+    assertEquals("All datanodes must be alive", numDatanodes,
+        client.datanodeReport(DatanodeReportType.LIVE).length);
+    // wait for the block to be replicated
+    int tries = 0;
+    while (tries++ < 20) {
+      try {
+        Thread.sleep(1000);
+        if (checkFile(fileSys, file1, replicas, datanodeInfo.getXferAddr(),
+            numDatanodes) == null) {
+          break;
+        }
+      } catch (InterruptedException ie) {
+      }
+    }
+    assertTrue("Checked if block was replicated after decommission, tried "
+        + tries + " times.", tries < 20);
+    cleanupFile(fileSys, file1);
+
+    // Restart the cluster and ensure recommissioned datanodes
+    // are allowed to register with the namenode
+    cluster.shutdown();
+    startCluster(numNamenodes, numDatanodes, conf);
+    cluster.shutdown();
+  }
 
 }
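For context on why the guard matters: after a NameNode restart,
checkDecommissionState can run before a decommissioning datanode has sent its
first block report. With zero blocks known on the node,
BlockManager#isReplicationInProgress trivially finds no pending work, and the
node is marked DECOMMISSIONED even though its replicas were never copied
elsewhere; that is the scenario testDecommissionWithNamenodeRestart reproduces
with a single-replica file. The following standalone sketch models the
sequence under simplified assumptions (Node, isReplicationInProgress, and the
counters here are stand-ins for illustration, not the real classes):

    // Standalone model of the race this patch closes; all names and counters
    // below are simplified stand-ins, not the real Hadoop classes.
    public class DecommissionRaceSketch {

      static class Node {
        boolean decommissionInProgress = true;
        boolean decommissioned = false;
        // Block reports received since (re-)registration; 0 right after an
        // NN restart until the datanode reports in.
        int blockReportCount = 0;
        // Blocks the NameNode currently knows still need more replicas.
        int underReplicatedBlocksKnown = 0;

        boolean checkBlockReportReceived() {
          return blockReportCount > 0;
        }
      }

      // Crude stand-in for BlockManager#isReplicationInProgress: with no
      // blocks known on the node there is trivially nothing to replicate.
      static boolean isReplicationInProgress(Node n) {
        return n.underReplicatedBlocksKnown > 0;
      }

      // 'guarded' toggles the HDFS-3087 condition on and off.
      static void checkDecommissionState(Node n, boolean guarded) {
        if (n.decommissionInProgress
            && (!guarded || n.checkBlockReportReceived())) {
          if (!isReplicationInProgress(n)) {
            n.decommissioned = true;
          }
        }
      }

      public static void main(String[] args) {
        // Pre-patch behavior: decommission "completes" immediately after an
        // NN restart, before any block has actually been re-replicated.
        Node a = new Node();
        checkDecommissionState(a, false);
        System.out.println("unguarded: " + a.decommissioned);  // true (the bug)

        // Post-patch behavior: completion waits for the first block report.
        Node b = new Node();
        checkDecommissionState(b, true);
        System.out.println("guarded, no report: " + b.decommissioned);  // false

        // The first report reveals blocks that still need copies, so the
        // node stays in DECOMMISSION_IN_PROGRESS until replication finishes.
        b.blockReportCount = 1;
        b.underReplicatedBlocksKnown = 5;
        checkDecommissionState(b, true);
        System.out.println("guarded, reported: " + b.decommissioned);  // false
      }
    }

The guarded version simply defers completion until the first report arrives,
at which point any blocks still needing copies become visible and keep the
node in the in-progress state while replication work is queued.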