Repository: mesos
Updated Branches:
  refs/heads/master 9da692ac7 -> e3836e964


Fixed master to properly handle pending tasks.

Pending tasks are always removed from frameworks `pending` map,
irrespective of whether the task launch is successful or not.

Review: https://reviews.apache.org/r/52440


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e3836e96
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e3836e96
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e3836e96

Branch: refs/heads/master
Commit: e3836e964c3f75b9c3961b8c527beb6d5a5f1130
Parents: 9da692a
Author: Vinod Kone <vinodk...@gmail.com>
Authored: Fri Sep 30 12:38:01 2016 -0700
Committer: Vinod Kone <vinodk...@gmail.com>
Committed: Fri Sep 30 13:19:33 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp                    | 41 +++++++++------
 src/tests/master_authorization_tests.cpp | 72 +++++++++++++++++++++++++--
 src/tests/master_validation_tests.cpp    | 33 ++++++++++++
 3 files changed, 128 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/e3836e96/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 756ab54..c83ee2f 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -3685,6 +3685,15 @@ void Master::_accept(
       }();
 
       foreach (const TaskInfo& task, tasks) {
+        // Remove the task from being pending.
+        framework->pendingTasks.erase(task.task_id());
+        if (slave != nullptr) {
+          slave->pendingTasks[framework->id()].erase(task.task_id());
+          if (slave->pendingTasks[framework->id()].empty()) {
+            slave->pendingTasks.erase(framework->id());
+          }
+        }
+
         const TaskStatus::Reason reason =
             slave == nullptr ? TaskStatus::REASON_SLAVE_REMOVED
                           : TaskStatus::REASON_SLAVE_DISCONNECTED;
@@ -4183,6 +4192,21 @@ void Master::_accept(
         const ExecutorInfo& executor = operation.launch_group().executor();
         const TaskGroupInfo& taskGroup = operation.launch_group().task_group();
 
+        // Remove all the tasks from being pending.
+        hashset<TaskID> killed;
+        foreach (const TaskInfo& task, taskGroup.tasks()) {
+          bool pending = framework->pendingTasks.contains(task.task_id());
+          framework->pendingTasks.erase(task.task_id());
+          slave->pendingTasks[framework->id()].erase(task.task_id());
+          if (slave->pendingTasks[framework->id()].empty()) {
+            slave->pendingTasks.erase(framework->id());
+          }
+
+          if (!pending) {
+            killed.insert(task.task_id());
+          }
+        }
+
         // Note that we do not fill in the `ExecutorInfo.framework_id`
         // since we do not have to support backwards compatiblity like
         // in the `Launch` operation case.
@@ -4267,22 +4291,9 @@ void Master::_accept(
           continue;
         }
 
-        // Remove all the tasks from being pending. If any of the tasks
-        // have been killed in the interim, we will send TASK_KILLED
-        // for all other tasks in the group, since a TaskGroup must
-        // be delivered in its entirety.
-        hashset<TaskID> killed;
-        foreach (const TaskInfo& task, taskGroup.tasks()) {
-          bool pending = framework->pendingTasks.contains(task.task_id());
-          framework->pendingTasks.erase(task.task_id());
-
-          if (!pending) {
-            killed.insert(task.task_id());
-          }
-        }
-
         // If task(s) were killed, send TASK_KILLED for
-        // all of the remaining tasks.
+        // all of the remaining tasks, since a TaskGroup must
+        // be delivered in its entirety.
         //
         // TODO(bmahler): Do this killing when processing
         // the `Kill` call, rather than doing it here.

http://git-wip-us.apache.org/repos/asf/mesos/blob/e3836e96/src/tests/master_authorization_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_authorization_tests.cpp 
b/src/tests/master_authorization_tests.cpp
index 74422c7..eaf0f01 100644
--- a/src/tests/master_authorization_tests.cpp
+++ b/src/tests/master_authorization_tests.cpp
@@ -227,6 +227,17 @@ TEST_F(MasterAuthorizationTest, UnauthorizedTask)
   EXPECT_EQ(TASK_ERROR, status.get().state());
   EXPECT_EQ(TaskStatus::REASON_TASK_UNAUTHORIZED, status.get().reason());
 
+  // Make sure the task is not known to master anymore.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.reconcileTasks({});
+
+  // We pause the clock here to ensure any updates sent by the master
+  // are received. There shouldn't be any updates in this case.
+  Clock::pause();
+  Clock::settle();
+
   driver.stop();
   driver.join();
 }
@@ -326,13 +337,24 @@ TEST_F(MasterAuthorizationTest, UnauthorizedTaskGroup)
   EXPECT_EQ(TASK_ERROR, task2Status->state());
   EXPECT_EQ(TaskStatus::REASON_TASK_GROUP_UNAUTHORIZED, task2Status->reason());
 
+  // Make sure the task group is not known to master anymore.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.reconcileTasks({});
+
+  // We pause the clock here to ensure any updates sent by the master
+  // are received. There shouldn't be any updates in this case.
+  Clock::pause();
+  Clock::settle();
+
   driver.stop();
   driver.join();
 }
 
 
 // This test verifies that a 'killTask()' that comes before
-// '_launchTasks()' is called results in TASK_KILLED.
+// '_accept()' is called results in TASK_KILLED.
 TEST_F(MasterAuthorizationTest, KillTask)
 {
   MockAuthorizer authorizer;
@@ -398,6 +420,17 @@ TEST_F(MasterAuthorizationTest, KillTask)
   // returned to the allocator.
   AWAIT_READY(recoverResources);
 
+  // Make sure the task is not known to master anymore.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.reconcileTasks({});
+
+  // We pause the clock here to ensure any updates sent by the master
+  // are received. There shouldn't be any updates in this case.
+  Clock::pause();
+  Clock::settle();
+
   driver.stop();
   driver.join();
 }
@@ -517,13 +550,24 @@ TEST_F(MasterAuthorizationTest, 
KillPendingTaskInTaskGroup)
   // returned to the allocator.
   AWAIT_READY(recoverResources);
 
+  // Make sure the task group is not known to master anymore.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.reconcileTasks({});
+
+  // We pause the clock here to ensure any updates sent by the master
+  // are received. There shouldn't be any updates in this case.
+  Clock::pause();
+  Clock::settle();
+
   driver.stop();
   driver.join();
 }
 
 
 // This test verifies that a slave removal that comes before
-// '_launchTasks()' is called results in TASK_LOST.
+// '_accept()' is called results in TASK_LOST.
 TEST_F(MasterAuthorizationTest, SlaveRemoved)
 {
   MockAuthorizer authorizer;
@@ -609,6 +653,17 @@ TEST_F(MasterAuthorizationTest, SlaveRemoved)
   EXPECT_EQ(
       1u, stats.values["master/task_lost/source_master/reason_slave_removed"]);
 
+  // Make sure the task is not known to master anymore.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.reconcileTasks({});
+
+  // We pause the clock here to ensure any updates sent by the master
+  // are received. There shouldn't be any updates in this case.
+  Clock::pause();
+  Clock::settle();
+
   driver.stop();
   driver.join();
 }
@@ -704,13 +759,24 @@ TEST_F(MasterAuthorizationTest, SlaveDisconnected)
       1u,
       stats.values["master/task_lost/source_master/reason_slave_removed"]);
 
+  // Make sure the task is not known to master anymore.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.reconcileTasks({});
+
+  // We pause the clock here to ensure any updates sent by the master
+  // are received. There shouldn't be any updates in this case.
+  Clock::pause();
+  Clock::settle();
+
   driver.stop();
   driver.join();
 }
 
 
 // This test verifies that a framework removal that comes before
-// '_launchTasks()' is called results in recovery of resources.
+// '_accept()' is called results in recovery of resources.
 TEST_F(MasterAuthorizationTest, FrameworkRemoved)
 {
   MockAuthorizer authorizer;

http://git-wip-us.apache.org/repos/asf/mesos/blob/e3836e96/src/tests/master_validation_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_validation_tests.cpp 
b/src/tests/master_validation_tests.cpp
index 16c5773..99e350e 100644
--- a/src/tests/master_validation_tests.cpp
+++ b/src/tests/master_validation_tests.cpp
@@ -709,6 +709,17 @@ TEST_F(TaskValidationTest, ExecutorUsesInvalidFrameworkID)
   EXPECT_TRUE(strings::startsWith(
       status.get().message(), "ExecutorInfo has an invalid FrameworkID"));
 
+  // Make sure the task is not known to master anymore.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.reconcileTasks({});
+
+  // We pause the clock here to ensure any updates sent by the master
+  // are received. There shouldn't be any updates in this case.
+  Clock::pause();
+  Clock::settle();
+
   driver.stop();
   driver.join();
 }
@@ -1806,6 +1817,17 @@ TEST_F(TaskGroupValidationTest, 
ExecutorUsesDockerContainerInfo)
       "Docker ContainerInfo is not supported on the executor",
       task2Status->message());
 
+  // Make sure the task is not known to master anymore.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.reconcileTasks({});
+
+  // We pause the clock here to ensure any updates sent by the master
+  // are received. There shouldn't be any updates in this case.
+  Clock::pause();
+  Clock::settle();
+
   driver.stop();
   driver.join();
 }
@@ -1885,6 +1907,17 @@ TEST_F(TaskGroupValidationTest, 
ExecutorWithoutFrameworkId)
   EXPECT_EQ(TASK_ERROR, task2Status->state());
   EXPECT_EQ(TaskStatus::REASON_TASK_GROUP_INVALID, task2Status->reason());
 
+  // Make sure the task is not known to master anymore.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.reconcileTasks({});
+
+  // We pause the clock here to ensure any updates sent by the master
+  // are received. There shouldn't be any updates in this case.
+  Clock::pause();
+  Clock::settle();
+
   driver.stop();
   driver.join();
 }

Reply via email to