This is an automated email from the ASF dual-hosted git repository.

bmahler pushed a commit to branch 1.9.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 598af2f6581c9e75fef39a8423091de369a57f3a
Author: Benjamin Mahler <bmah...@apache.org>
AuthorDate: Tue Sep 1 14:58:28 2020 -0400

    Fixed a CHECK failure in master during agent removal.
    
    Per MESOS-9609, it's possible for the master to encounter a CHECK
    failure during agent removal in the following situation:
    
      1. Given a framework with checkpoint == false, with only
         executor(s) (no tasks) running on an agent:
      2. When this agent disconects from the master,
         Master::removeFramework(Slave*, Framework*) removes the
         tasks and executors. However, when there are no tasks, this
         function will accidentally insert an entry into
         Master::Slave::tasks! (Due to the [] operator usage)
      3. Now if the framework is removed, we have an entry in
         Slave::tasks, for which there is no corresponding framework.
      4. When the agent is removed, we have a CHECK failure given
         we can't find the framework.
    
    This fixes the issue by avoiding the accidental insertion.
    
    Review: https://reviews.apache.org/r/72831
---
 src/master/master.cpp | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/master/master.cpp b/src/master/master.cpp
index f641adc..a2ddf7f 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -11457,30 +11457,32 @@ void Master::removeFramework(Slave* slave, Framework* 
framework)
 
   // Remove pointers to framework's tasks in slaves, and send status
   // updates.
-  // NOTE: A copy is needed because removeTask modifies slave->tasks.
-  foreachvalue (Task* task, utils::copy(slave->tasks[framework->id()])) {
-    // Remove tasks that belong to this framework.
-    if (task->framework_id() == framework->id()) {
-      // A framework might not actually exist because the master failed
-      // over and the framework hasn't reconnected yet. For more info
-      // please see the comments in 'removeFramework(Framework*)'.
-      const StatusUpdate& update = protobuf::createStatusUpdate(
-        task->framework_id(),
-        task->slave_id(),
-        task->task_id(),
-        TASK_LOST,
-        TaskStatus::SOURCE_MASTER,
-        None(),
-        "Agent " + slave->info.hostname() + " disconnected",
-        TaskStatus::REASON_SLAVE_DISCONNECTED,
-        (task->has_executor_id()
-            ? Option<ExecutorID>(task->executor_id()) : None()));
+  if (slave->tasks.contains(framework->id())) {
+    // NOTE: A copy is needed because removeTask modifies slave->tasks.
+    foreachvalue (Task* task, utils::copy(slave->tasks.at(framework->id()))) {
+      // Remove tasks that belong to this framework.
+      if (task->framework_id() == framework->id()) {
+        // A framework might not actually exist because the master failed
+        // over and the framework hasn't reconnected yet. For more info
+        // please see the comments in 'removeFramework(Framework*)'.
+        const StatusUpdate& update = protobuf::createStatusUpdate(
+          task->framework_id(),
+          task->slave_id(),
+          task->task_id(),
+          TASK_LOST,
+          TaskStatus::SOURCE_MASTER,
+          None(),
+          "Agent " + slave->info.hostname() + " disconnected",
+          TaskStatus::REASON_SLAVE_DISCONNECTED,
+          (task->has_executor_id()
+              ? Option<ExecutorID>(task->executor_id()) : None()));
 
-      updateTask(task, update);
-      removeTask(task);
+        updateTask(task, update);
+        removeTask(task);
 
-      if (framework->connected()) {
-        forward(update, UPID(), framework);
+        if (framework->connected()) {
+          forward(update, UPID(), framework);
+        }
       }
     }
   }

Reply via email to