This is an automated email from the ASF dual-hosted git repository. bmahler pushed a commit to branch 1.10.x in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 4379ab2f94fdcf35a6b808193fe9bf00765dcf40 Author: Benjamin Mahler <bmah...@apache.org> AuthorDate: Tue Sep 1 14:58:28 2020 -0400 Fixed a CHECK failure in master during agent removal. Per MESOS-9609, it's possible for the master to encounter a CHECK failure during agent removal in the following situation: 1. Given a framework with checkpoint == false, with only executor(s) (no tasks) running on an agent: 2. When this agent disconects from the master, Master::removeFramework(Slave*, Framework*) removes the tasks and executors. However, when there are no tasks, this function will accidentally insert an entry into Master::Slave::tasks! (Due to the [] operator usage) 3. Now if the framework is removed, we have an entry in Slave::tasks, for which there is no corresponding framework. 4. When the agent is removed, we have a CHECK failure given we can't find the framework. This fixes the issue by avoiding the accidental insertion. Review: https://reviews.apache.org/r/72831 --- src/master/master.cpp | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/master/master.cpp b/src/master/master.cpp index 6a013e3..c78d0df 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -10484,30 +10484,32 @@ void Master::removeFramework(Slave* slave, Framework* framework) // Remove pointers to framework's tasks in slaves, and send status // updates. - // NOTE: A copy is needed because removeTask modifies slave->tasks. - foreachvalue (Task* task, utils::copy(slave->tasks[framework->id()])) { - // Remove tasks that belong to this framework. - if (task->framework_id() == framework->id()) { - // A framework might not actually exist because the master failed - // over and the framework hasn't reconnected yet. For more info - // please see the comments in 'removeFramework(Framework*)'. - const StatusUpdate& update = protobuf::createStatusUpdate( - task->framework_id(), - task->slave_id(), - task->task_id(), - TASK_LOST, - TaskStatus::SOURCE_MASTER, - None(), - "Agent " + slave->info.hostname() + " disconnected", - TaskStatus::REASON_SLAVE_DISCONNECTED, - (task->has_executor_id() - ? Option<ExecutorID>(task->executor_id()) : None())); + if (slave->tasks.contains(framework->id())) { + // NOTE: A copy is needed because removeTask modifies slave->tasks. + foreachvalue (Task* task, utils::copy(slave->tasks.at(framework->id()))) { + // Remove tasks that belong to this framework. + if (task->framework_id() == framework->id()) { + // A framework might not actually exist because the master failed + // over and the framework hasn't reconnected yet. For more info + // please see the comments in 'removeFramework(Framework*)'. + const StatusUpdate& update = protobuf::createStatusUpdate( + task->framework_id(), + task->slave_id(), + task->task_id(), + TASK_LOST, + TaskStatus::SOURCE_MASTER, + None(), + "Agent " + slave->info.hostname() + " disconnected", + TaskStatus::REASON_SLAVE_DISCONNECTED, + (task->has_executor_id() + ? Option<ExecutorID>(task->executor_id()) : None())); - updateTask(task, update); - removeTask(task); + updateTask(task, update); + removeTask(task); - if (framework->connected()) { - forward(update, UPID(), framework); + if (framework->connected()) { + forward(update, UPID(), framework); + } } } }