Repository: mesos Updated Branches: refs/heads/master 9da692ac7 -> e3836e964
Fixed master to properly handle pending tasks. Pending tasks are always removed from frameworks `pending` map, irrespective of whether the task launch is successful or not. Review: https://reviews.apache.org/r/52440 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e3836e96 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e3836e96 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e3836e96 Branch: refs/heads/master Commit: e3836e964c3f75b9c3961b8c527beb6d5a5f1130 Parents: 9da692a Author: Vinod Kone <vinodk...@gmail.com> Authored: Fri Sep 30 12:38:01 2016 -0700 Committer: Vinod Kone <vinodk...@gmail.com> Committed: Fri Sep 30 13:19:33 2016 -0700 ---------------------------------------------------------------------- src/master/master.cpp | 41 +++++++++------ src/tests/master_authorization_tests.cpp | 72 +++++++++++++++++++++++++-- src/tests/master_validation_tests.cpp | 33 ++++++++++++ 3 files changed, 128 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/e3836e96/src/master/master.cpp ---------------------------------------------------------------------- diff --git a/src/master/master.cpp b/src/master/master.cpp index 756ab54..c83ee2f 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -3685,6 +3685,15 @@ void Master::_accept( }(); foreach (const TaskInfo& task, tasks) { + // Remove the task from being pending. + framework->pendingTasks.erase(task.task_id()); + if (slave != nullptr) { + slave->pendingTasks[framework->id()].erase(task.task_id()); + if (slave->pendingTasks[framework->id()].empty()) { + slave->pendingTasks.erase(framework->id()); + } + } + const TaskStatus::Reason reason = slave == nullptr ? TaskStatus::REASON_SLAVE_REMOVED : TaskStatus::REASON_SLAVE_DISCONNECTED; @@ -4183,6 +4192,21 @@ void Master::_accept( const ExecutorInfo& executor = operation.launch_group().executor(); const TaskGroupInfo& taskGroup = operation.launch_group().task_group(); + // Remove all the tasks from being pending. + hashset<TaskID> killed; + foreach (const TaskInfo& task, taskGroup.tasks()) { + bool pending = framework->pendingTasks.contains(task.task_id()); + framework->pendingTasks.erase(task.task_id()); + slave->pendingTasks[framework->id()].erase(task.task_id()); + if (slave->pendingTasks[framework->id()].empty()) { + slave->pendingTasks.erase(framework->id()); + } + + if (!pending) { + killed.insert(task.task_id()); + } + } + // Note that we do not fill in the `ExecutorInfo.framework_id` // since we do not have to support backwards compatiblity like // in the `Launch` operation case. @@ -4267,22 +4291,9 @@ void Master::_accept( continue; } - // Remove all the tasks from being pending. If any of the tasks - // have been killed in the interim, we will send TASK_KILLED - // for all other tasks in the group, since a TaskGroup must - // be delivered in its entirety. - hashset<TaskID> killed; - foreach (const TaskInfo& task, taskGroup.tasks()) { - bool pending = framework->pendingTasks.contains(task.task_id()); - framework->pendingTasks.erase(task.task_id()); - - if (!pending) { - killed.insert(task.task_id()); - } - } - // If task(s) were killed, send TASK_KILLED for - // all of the remaining tasks. + // all of the remaining tasks, since a TaskGroup must + // be delivered in its entirety. // // TODO(bmahler): Do this killing when processing // the `Kill` call, rather than doing it here. http://git-wip-us.apache.org/repos/asf/mesos/blob/e3836e96/src/tests/master_authorization_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/master_authorization_tests.cpp b/src/tests/master_authorization_tests.cpp index 74422c7..eaf0f01 100644 --- a/src/tests/master_authorization_tests.cpp +++ b/src/tests/master_authorization_tests.cpp @@ -227,6 +227,17 @@ TEST_F(MasterAuthorizationTest, UnauthorizedTask) EXPECT_EQ(TASK_ERROR, status.get().state()); EXPECT_EQ(TaskStatus::REASON_TASK_UNAUTHORIZED, status.get().reason()); + // Make sure the task is not known to master anymore. + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .Times(0); + + driver.reconcileTasks({}); + + // We pause the clock here to ensure any updates sent by the master + // are received. There shouldn't be any updates in this case. + Clock::pause(); + Clock::settle(); + driver.stop(); driver.join(); } @@ -326,13 +337,24 @@ TEST_F(MasterAuthorizationTest, UnauthorizedTaskGroup) EXPECT_EQ(TASK_ERROR, task2Status->state()); EXPECT_EQ(TaskStatus::REASON_TASK_GROUP_UNAUTHORIZED, task2Status->reason()); + // Make sure the task group is not known to master anymore. + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .Times(0); + + driver.reconcileTasks({}); + + // We pause the clock here to ensure any updates sent by the master + // are received. There shouldn't be any updates in this case. + Clock::pause(); + Clock::settle(); + driver.stop(); driver.join(); } // This test verifies that a 'killTask()' that comes before -// '_launchTasks()' is called results in TASK_KILLED. +// '_accept()' is called results in TASK_KILLED. TEST_F(MasterAuthorizationTest, KillTask) { MockAuthorizer authorizer; @@ -398,6 +420,17 @@ TEST_F(MasterAuthorizationTest, KillTask) // returned to the allocator. AWAIT_READY(recoverResources); + // Make sure the task is not known to master anymore. + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .Times(0); + + driver.reconcileTasks({}); + + // We pause the clock here to ensure any updates sent by the master + // are received. There shouldn't be any updates in this case. + Clock::pause(); + Clock::settle(); + driver.stop(); driver.join(); } @@ -517,13 +550,24 @@ TEST_F(MasterAuthorizationTest, KillPendingTaskInTaskGroup) // returned to the allocator. AWAIT_READY(recoverResources); + // Make sure the task group is not known to master anymore. + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .Times(0); + + driver.reconcileTasks({}); + + // We pause the clock here to ensure any updates sent by the master + // are received. There shouldn't be any updates in this case. + Clock::pause(); + Clock::settle(); + driver.stop(); driver.join(); } // This test verifies that a slave removal that comes before -// '_launchTasks()' is called results in TASK_LOST. +// '_accept()' is called results in TASK_LOST. TEST_F(MasterAuthorizationTest, SlaveRemoved) { MockAuthorizer authorizer; @@ -609,6 +653,17 @@ TEST_F(MasterAuthorizationTest, SlaveRemoved) EXPECT_EQ( 1u, stats.values["master/task_lost/source_master/reason_slave_removed"]); + // Make sure the task is not known to master anymore. + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .Times(0); + + driver.reconcileTasks({}); + + // We pause the clock here to ensure any updates sent by the master + // are received. There shouldn't be any updates in this case. + Clock::pause(); + Clock::settle(); + driver.stop(); driver.join(); } @@ -704,13 +759,24 @@ TEST_F(MasterAuthorizationTest, SlaveDisconnected) 1u, stats.values["master/task_lost/source_master/reason_slave_removed"]); + // Make sure the task is not known to master anymore. + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .Times(0); + + driver.reconcileTasks({}); + + // We pause the clock here to ensure any updates sent by the master + // are received. There shouldn't be any updates in this case. + Clock::pause(); + Clock::settle(); + driver.stop(); driver.join(); } // This test verifies that a framework removal that comes before -// '_launchTasks()' is called results in recovery of resources. +// '_accept()' is called results in recovery of resources. TEST_F(MasterAuthorizationTest, FrameworkRemoved) { MockAuthorizer authorizer; http://git-wip-us.apache.org/repos/asf/mesos/blob/e3836e96/src/tests/master_validation_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/master_validation_tests.cpp b/src/tests/master_validation_tests.cpp index 16c5773..99e350e 100644 --- a/src/tests/master_validation_tests.cpp +++ b/src/tests/master_validation_tests.cpp @@ -709,6 +709,17 @@ TEST_F(TaskValidationTest, ExecutorUsesInvalidFrameworkID) EXPECT_TRUE(strings::startsWith( status.get().message(), "ExecutorInfo has an invalid FrameworkID")); + // Make sure the task is not known to master anymore. + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .Times(0); + + driver.reconcileTasks({}); + + // We pause the clock here to ensure any updates sent by the master + // are received. There shouldn't be any updates in this case. + Clock::pause(); + Clock::settle(); + driver.stop(); driver.join(); } @@ -1806,6 +1817,17 @@ TEST_F(TaskGroupValidationTest, ExecutorUsesDockerContainerInfo) "Docker ContainerInfo is not supported on the executor", task2Status->message()); + // Make sure the task is not known to master anymore. + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .Times(0); + + driver.reconcileTasks({}); + + // We pause the clock here to ensure any updates sent by the master + // are received. There shouldn't be any updates in this case. + Clock::pause(); + Clock::settle(); + driver.stop(); driver.join(); } @@ -1885,6 +1907,17 @@ TEST_F(TaskGroupValidationTest, ExecutorWithoutFrameworkId) EXPECT_EQ(TASK_ERROR, task2Status->state()); EXPECT_EQ(TaskStatus::REASON_TASK_GROUP_INVALID, task2Status->reason()); + // Make sure the task is not known to master anymore. + EXPECT_CALL(sched, statusUpdate(&driver, _)) + .Times(0); + + driver.reconcileTasks({}); + + // We pause the clock here to ensure any updates sent by the master + // are received. There shouldn't be any updates in this case. + Clock::pause(); + Clock::settle(); + driver.stop(); driver.join(); }