Author: suresh Date: Mon Jun 10 17:59:29 2013 New Revision: 1491544 URL: http://svn.apache.org/r1491544 Log: HDFS-4261. Merge r1488865 from branch-1
Modified: hadoop/common/branches/branch-1.2/CHANGES.txt hadoop/common/branches/branch-1.2/src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java hadoop/common/branches/branch-1.2/src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java Modified: hadoop/common/branches/branch-1.2/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.2/CHANGES.txt?rev=1491544&r1=1491543&r2=1491544&view=diff ============================================================================== --- hadoop/common/branches/branch-1.2/CHANGES.txt (original) +++ hadoop/common/branches/branch-1.2/CHANGES.txt Mon Jun 10 17:59:29 2013 @@ -29,6 +29,9 @@ Release 1.2.1 - Unreleased HDFS-4699. Additional conditions for avoiding unnecessary DataNode.checkDiskError calls. (Chris Nauroth via kihwal) + HDFS-4261. Fix bugs in Balancer causing infinite loop and + TestBalancerWithNodeGroup timing out. (Junping Du via szetszwo) + Release 1.2.0 - 2013.05.05 INCOMPATIBLE CHANGES Modified: hadoop/common/branches/branch-1.2/src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.2/src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java?rev=1491544&r1=1491543&r2=1491544&view=diff ============================================================================== --- hadoop/common/branches/branch-1.2/src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java (original) +++ hadoop/common/branches/branch-1.2/src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java Mon Jun 10 17:59:29 2013 @@ -193,6 +193,8 @@ public class Balancer implements Tool { */ public static final int MAX_NUM_CONCURRENT_MOVES = 5; + public static final int MAX_NO_PENDING_BLOCK_ITERATIONS = 5; + private Configuration conf; private double threshold = 10D; @@ -746,6 +748,7 @@ public class Balancer implements Tool { long startTime = Util.now(); this.blocksToReceive = 2*scheduledSize; boolean isTimeUp = false; + int 
noPendingBlockIteration = 0; while(!isTimeUp && scheduledSize > 0 && (!srcBlockList.isEmpty() || blocksToReceive > 0)) { PendingBlockMove pendingBlock = chooseNextBlockToMove(); @@ -769,7 +772,15 @@ public class Balancer implements Tool { LOG.warn(StringUtils.stringifyException(e)); return; } - } + } else { + // source node cannot find a pendingBlockToMove, iteration +1 + noPendingBlockIteration++; + // in case no blocks can be moved for source node's task, + // jump out of while-loop after 5 iterations. + if (noPendingBlockIteration >= MAX_NO_PENDING_BLOCK_ITERATIONS) { + scheduledSize = 0; + } + } // check if time is up or not if (Util.now()-startTime > MAX_ITERATION_TIME) { @@ -1496,7 +1507,11 @@ public class Balancer implements Tool { Formatter formatter = new Formatter(System.out); System.out.println("Time Stamp Iteration# Bytes Already Moved Bytes Left To Move Bytes Being Moved"); int iterations = 0; + while (true) { + // clean all lists at the beginning of balancer iteration. + resetData(); + /* get all live datanodes of a cluster and their disk usage * decide the number of bytes need to be moved */ @@ -1547,9 +1562,6 @@ public class Balancer implements Tool { return NO_MOVE_PROGRESS; } } - - // clean all lists - resetData(); try { Thread.sleep(2*conf.getLong("dfs.heartbeat.interval", 3)); Modified: hadoop/common/branches/branch-1.2/src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.2/src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java?rev=1491544&r1=1491543&r2=1491544&view=diff ============================================================================== --- hadoop/common/branches/branch-1.2/src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java (original) +++ hadoop/common/branches/branch-1.2/src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java Mon Jun 10 17:59:29 2013 @@ -216,7 +216,7 
@@ public class TestBalancerWithNodeGroup { * to n0 or n1 as balancer policy with node group. Thus, we expect the balancer * to end in 5 iterations without move block process. */ - @Test + @Test(timeout=60000) public void testBalancerEndInNoMoveProgress() throws Exception { Configuration conf = createConf(); long[] capacities = new long[]{CAPACITY, CAPACITY, CAPACITY, CAPACITY}; @@ -255,7 +255,7 @@ public class TestBalancerWithNodeGroup { * Create a cluster with even distribution, and a new empty node is added to * the cluster, then test rack locality for balancer policy. */ - @Test + @Test(timeout=60000) public void testBalancerWithRackLocality() throws Exception { Configuration conf = createConf(); long[] capacities = new long[]{CAPACITY, CAPACITY}; @@ -294,7 +294,7 @@ public class TestBalancerWithNodeGroup { totalCapacity += newCapacity; // run balancer and validate results - runBalancer(conf, totalUsedSpace, totalCapacity); + runBalancerCanFinish(conf, totalUsedSpace, totalCapacity); DatanodeInfo[] datanodeReport = client.getDatanodeReport(DatanodeReportType.ALL); @@ -321,7 +321,7 @@ public class TestBalancerWithNodeGroup { /** Create a cluster with even distribution, and a new empty node is added to * the cluster, then test rack locality for balancer policy. **/ - @Test + @Test(timeout=60000) public void testBalancerWithNodeGroup() throws Exception { Configuration conf = createConf(); long[] capacities = new long[]{CAPACITY, CAPACITY};