YARN-5195. RM intermittently crashed with NPE while handling APP_ATTEMPT_REMOVED event when async-scheduling enabled in CapacityScheduler. (sandflee via wangda)
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/d62e121f Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/d62e121f Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/d62e121f Branch: refs/heads/HADOOP-12756 Commit: d62e121ffc0239e7feccc1e23ece92c5fac685f6 Parents: 2d8d183 Author: Wangda Tan <wan...@apache.org> Authored: Tue Jul 26 21:22:59 2016 -0700 Committer: Wangda Tan <wan...@apache.org> Committed: Tue Jul 26 21:22:59 2016 -0700 ---------------------------------------------------------------------- .../scheduler/capacity/CapacityScheduler.java | 9 ++++- .../capacity/TestCapacityScheduler.java | 40 ++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/d62e121f/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index ee62a70..bedf455 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -1209,11 +1209,18 @@ public class CapacityScheduler extends } @VisibleForTesting - protected synchronized void allocateContainersToNode(FiCaSchedulerNode node) { + public synchronized void allocateContainersToNode(FiCaSchedulerNode node) { if (rmContext.isWorkPreservingRecoveryEnabled() && !rmContext.isSchedulerReadyForAllocatingContainers()) { return; } + + if (!nodeTracker.exists(node.getNodeID())) { + LOG.info("Skipping scheduling as the node " + node.getNodeID() + + " has been removed"); + return; + } + // reset allocation and reservation stats before we start doing any work updateSchedulerHealth(lastNodeUpdateTime, node, new CSAssignment(Resources.none(), NodeType.NODE_LOCAL)); http://git-wip-us.apache.org/repos/asf/hadoop/blob/d62e121f/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java index fb021c0..d3567f5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java @@ -3375,4 +3375,44 @@ public class TestCapacityScheduler { Assert.assertEquals(availableResource.getMemorySize(), 0); Assert.assertEquals(availableResource.getVirtualCores(), 0); } + + @Test + public void testSchedulingOnRemovedNode() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, + ResourceScheduler.class); + conf.setBoolean( + CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_ENABLE, + false); + + MockRM rm = new MockRM(conf); + rm.start(); + RMApp app = rm.submitApp(100); + rm.drainEvents(); + + MockNM nm1 = rm.registerNode("127.0.0.1:1234", 10240, 10); + MockAM am = MockRM.launchAndRegisterAM(app, rm, nm1); + + //remove nm2 to keep am alive + MockNM nm2 = rm.registerNode("127.0.0.1:1235", 10240, 10); + + am.allocate(ResourceRequest.ANY, 2048, 1, null); + + CapacityScheduler scheduler = + (CapacityScheduler) rm.getRMContext().getScheduler(); + FiCaSchedulerNode node = + (FiCaSchedulerNode) + scheduler.getNodeTracker().getNode(nm2.getNodeId()); + scheduler.handle(new NodeRemovedSchedulerEvent( + rm.getRMContext().getRMNodes().get(nm2.getNodeId()))); + // schedulerNode is removed, try allocate a container + scheduler.allocateContainersToNode(node); + + AppAttemptRemovedSchedulerEvent appRemovedEvent1 = + new AppAttemptRemovedSchedulerEvent( + am.getApplicationAttemptId(), + RMAppAttemptState.FINISHED, false); + scheduler.handle(appRemovedEvent1); + rm.stop(); + } } --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org