Change registry update order on removal, mark-unreachable.

This commit changes the master so that it always updates the registry
for an important change (e.g., removing an agent or marking an agent
unreachable) before updating the corresponding parts of the master's
in-memory state (e.g., the in-memory list of registered agents).
Previously, the registry was updated first for registration and
reregistration, but second for removal and marking unreachable.
Updating the registry first is simpler, and it also avoids potentially
leaking inaccurate information (e.g., via HTTP endpoints) if the
registry operation fails.
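
The following is a minimal, self-contained sketch of the ordering
described above. It is not actual Mesos code: the mock registry,
master, and agent types are invented for illustration. The registry
write is issued first, and in-memory state is mutated only in the
continuation that runs after the write completes.

#include <functional>
#include <iostream>
#include <set>
#include <string>

// Hypothetical stand-in for the registrar: simulates an asynchronous
// registry write and invokes `done(true)` once the write has succeeded.
struct MockRegistry {
  void apply(const std::string& op, std::function<void(bool)> done) {
    std::cout << "registry: applied " << op << std::endl;
    done(true);
  }
};

// Hypothetical stand-in for the master's in-memory state.
struct MockMaster {
  MockRegistry registrar;
  std::set<std::string> registered;  // Agents the master believes exist.
  std::set<std::string> removing;    // Agents with an in-flight removal.

  void removeAgent(const std::string& agentId) {
    // Step 1: record that a removal is in progress and update the
    // registry FIRST; the in-memory `registered` set is left untouched.
    removing.insert(agentId);
    registrar.apply("RemoveSlave(" + agentId + ")",
                    [this, agentId](bool success) {
                      _removeAgent(agentId, success);
                    });
  }

  void _removeAgent(const std::string& agentId, bool success) {
    // Step 2: only after the registry operation completes do we update
    // in-memory state, so clients (e.g., HTTP endpoints) never observe
    // a removal that was not durably recorded.
    removing.erase(agentId);
    if (!success) {
      std::cerr << "registry update failed; leaving agent registered"
                << std::endl;
      return;
    }
    registered.erase(agentId);
    std::cout << "master: removed " << agentId << std::endl;
  }
};

int main() {
  MockMaster master;
  master.registered.insert("agent-1");
  master.removeAgent("agent-1");
  return 0;
}

In the real master (see the diff below), the continuation (e.g.,
`_removeSlave` or `_markUnreachable`) is dispatched back to the master
actor via `defer(self(), ...)`, so the in-memory update still runs in
the master's own execution context.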

Review: https://reviews.apache.org/r/51374/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/88b2533a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/88b2533a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/88b2533a

Branch: refs/heads/master
Commit: 88b2533a72ff5897bc6cc5d7861a313c6fd7ea28
Parents: 0cb2480
Author: Neil Conway <neil.con...@gmail.com>
Authored: Mon Sep 19 15:47:53 2016 -0700
Committer: Vinod Kone <vinodk...@gmail.com>
Committed: Mon Sep 19 15:47:53 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp         | 332 ++++++++++++++++++-------------------
 src/master/master.hpp         |  16 +-
 src/tests/master_tests.cpp    |   1 +
 src/tests/partition_tests.cpp |  26 +--
 src/tests/slave_tests.cpp     |   1 +
 5 files changed, 193 insertions(+), 183 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/88b2533a/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 008ff32..71caea6 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1757,21 +1757,20 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
                << " after master failover; marking it unreachable";
 
   ++metrics->slave_unreachable_completed;
-  ++metrics->recovery_slave_removals;
 
   slaves.recovered.erase(slave.info().id());
 
+  TimeInfo unreachableTime = protobuf::getCurrentTime();
+
   if (flags.registry_strict) {
     slaves.markingUnreachable.insert(slave.info().id());
 
     registrar->apply(Owned<Operation>(
-        new MarkSlaveUnreachable(slave.info(), protobuf::getCurrentTime())))
+        new MarkSlaveUnreachable(slave.info(), unreachableTime)))
       .onAny(defer(self(),
-                   &Self::_markUnreachable,
+                   &Self::_markUnreachableAfterFailover,
                    slave.info(),
-                   vector<StatusUpdate>(), // No TASK_LOST updates to send.
-                   lambda::_1,
-                   "did not re-register after master failover"));
+                   lambda::_1));
   } else {
     // When a non-strict registry is in use, we want to ensure the
     // registry is used in a write-only manner. Therefore we remove
@@ -1781,7 +1780,7 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
       "Failed to mark agent " + stringify(slave.info().id()) + " unreachable";
 
     registrar->apply(Owned<Operation>(
-        new MarkSlaveUnreachable(slave.info(), protobuf::getCurrentTime())))
+        new MarkSlaveUnreachable(slave.info(), unreachableTime)))
       .onFailed(lambda::bind(fail, message, lambda::_1));
   }
 
@@ -1789,6 +1788,54 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
 }
 
 
+void Master::_markUnreachableAfterFailover(
+    const SlaveInfo& slaveInfo,
+    const Future<bool>& registrarResult)
+{
+  CHECK(slaves.markingUnreachable.contains(slaveInfo.id()));
+  slaves.markingUnreachable.erase(slaveInfo.id());
+
+  if (registrarResult.isFailed()) {
+    LOG(FATAL) << "Failed to mark agent " << slaveInfo.id()
+               << " (" << slaveInfo.hostname() << ")"
+               << " unreachable in the registry: "
+               << registrarResult.failure();
+  }
+
+  CHECK(!registrarResult.isDiscarded());
+
+  // `MarkSlaveUnreachable` registry operation should never fail.
+  CHECK(registrarResult.get());
+
+  LOG(INFO) << "Marked agent " << slaveInfo.id() << " ("
+            << slaveInfo.hostname() << ") unreachable: "
+            << "did not re-register after master failover";
+
+  ++metrics->slave_removals;
+  ++metrics->slave_removals_reason_unhealthy;
+  ++metrics->recovery_slave_removals;
+
+  sendSlaveLost(slaveInfo);
+}
+
+
+void Master::sendSlaveLost(const SlaveInfo& slaveInfo)
+{
+  foreachvalue (Framework* framework, frameworks.registered) {
+    LOG(INFO) << "Notifying framework " << *framework << " of lost agent "
+              << slaveInfo.id() << " (" << slaveInfo.hostname() << ")";
+
+    LostSlaveMessage message;
+    message.mutable_slave_id()->MergeFrom(slaveInfo.id());
+    framework->send(message);
+  }
+
+  if (HookManager::hooksAvailable()) {
+    HookManager::masterSlaveLostHook(slaveInfo);
+  }
+}
+
+
 void Master::fileAttached(const Future<Nothing>& result, const string& path)
 {
   if (result.isReady()) {
@@ -5679,6 +5726,54 @@ void Master::markUnreachable(const SlaveID& slaveId)
   CHECK(!slaves.unreachable.contains(slaveId));
   CHECK(slaves.removed.get(slaveId).isNone());
 
+  slaves.markingUnreachable.insert(slave->id);
+
+  // Use the same timestamp for all status updates sent below; we also
+  // use this timestamp when updating the registry.
+  TimeInfo unreachableTime = protobuf::getCurrentTime();
+
+  // Update the registry to move this slave from the list of admitted
+  // slaves to the list of unreachable slaves. After this is
+  // completed, we can update the master's in-memory state to remove
+  // the slave and send TASK_LOST status updates to the frameworks.
+  registrar->apply(Owned<Operation>(
+          new MarkSlaveUnreachable(slave->info, unreachableTime)))
+    .onAny(defer(self(),
+                 &Self::_markUnreachable,
+                 slave,
+                 unreachableTime,
+                 lambda::_1));
+}
+
+
+void Master::_markUnreachable(
+    Slave* slave,
+    TimeInfo unreachableTime,
+    const Future<bool>& registrarResult)
+{
+  CHECK_NOTNULL(slave);
+  CHECK(slaves.markingUnreachable.contains(slave->info.id()));
+  slaves.markingUnreachable.erase(slave->info.id());
+
+  if (registrarResult.isFailed()) {
+    LOG(FATAL) << "Failed to mark agent " << slave->info.id()
+               << " (" << slave->info.hostname() << ")"
+               << " unreachable in the registry: "
+               << registrarResult.failure();
+  }
+
+  CHECK(!registrarResult.isDiscarded());
+
+  // `MarkSlaveUnreachable` registry operation should never fail.
+  CHECK(registrarResult.get());
+
+  LOG(INFO) << "Marked agent " << slave->info.id() << " ("
+            << slave->info.hostname() << ") unreachable: "
+            << "health check timed out";
+
+  ++metrics->slave_removals;
+  ++metrics->slave_removals_reason_unhealthy;
+
   // We want to remove the slave first, to avoid the allocator
   // re-allocating the recovered resources.
   //
@@ -5689,17 +5784,9 @@ void Master::markUnreachable(const SlaveID& slaveId)
   // the slave is already removed.
   allocator->removeSlave(slave->id);
 
-  // Use the same timestamp for all status updates sent below; we also
-  // use this timestamp when updating the registry.
-  TimeInfo unreachableTime = protobuf::getCurrentTime();
-
-  // Transition the tasks to TASK_LOST and remove them, BUT do not
-  // send updates yet. Rather, build up the updates so that we can can
-  // send them after the slave has been moved to the unreachable list
-  // in the registry.
+  // Transition the tasks to TASK_LOST and remove them.
   // TODO(neilc): Update this to send TASK_UNREACHABLE for
-  // partition-aware frameworks.
-  vector<StatusUpdate> updates;
+  // PARTITION_AWARE frameworks.
   foreachkey (const FrameworkID& frameworkId, utils::copy(slave->tasks)) {
     Framework* framework = getFramework(frameworkId);
     CHECK_NOTNULL(framework);
@@ -5724,7 +5811,7 @@ void Master::markUnreachable(const SlaveID& slaveId)
       updateTask(task, update);
       removeTask(task);
 
-      updates.push_back(update);
+      forward(update, UPID(), framework);
     }
   }
 
@@ -5759,7 +5846,6 @@ void Master::markUnreachable(const SlaveID& slaveId)
   slaves.registered.remove(slave);
   slaves.removed.put(slave->id, Nothing());
   slaves.unreachable[slave->id] = unreachableTime;
-  slaves.markingUnreachable.insert(slave->id);
   authenticated.erase(slave->pid);
 
   // Remove the slave from the `machines` mapping.
@@ -5774,81 +5860,12 @@ void Master::markUnreachable(const SlaveID& slaveId)
 
   // TODO(benh): unlink(slave->pid);
 
-  // Update the registry to move this slave from the list of admitted
-  // slaves to the list of unreachable slaves. Once this is completed,
-  // we can forward the TASK_LOST updates to the frameworks.
-  registrar->apply(Owned<Operation>(
-          new MarkSlaveUnreachable(slave->info, unreachableTime)))
-    .onAny(defer(self(),
-                 &Self::_markUnreachable,
-                 slave->info,
-                 updates,
-                 lambda::_1,
-                 "health check timed out"));
+  sendSlaveLost(slave->info);
 
   delete slave;
 }
 
 
-void Master::_markUnreachable(
-    const SlaveInfo& slaveInfo,
-    const vector<StatusUpdate>& updates,
-    const Future<bool>& registrarResult,
-    const string& unreachableCause)
-{
-  CHECK(slaves.markingUnreachable.contains(slaveInfo.id()));
-  slaves.markingUnreachable.erase(slaveInfo.id());
-
-  if (registrarResult.isFailed()) {
-    LOG(FATAL) << "Failed to mark agent " << slaveInfo.id()
-               << " (" << slaveInfo.hostname() << ")"
-               << " unreachable in the registry: "
-               << registrarResult.failure();
-  }
-
-  CHECK(!registrarResult.isDiscarded());
-
-  // `MarkSlaveUnreachable` registry operation should never fail.
-  CHECK(registrarResult.get());
-
-  LOG(INFO) << "Marked agent " << slaveInfo.id() << " ("
-            << slaveInfo.hostname() << ") unreachable: "
-            << unreachableCause;
-
-  // TODO(neilc): Consider renaming these metrics or adding new
-  // metrics for the new PARTITION_AWARE semantics.
-  ++metrics->slave_removals;
-  ++metrics->slave_removals_reason_unhealthy;
-
-  // Forward the TASK_LOST updates on to the frameworks.
-  foreach (const StatusUpdate& update, updates) {
-    Framework* framework = getFramework(update.framework_id());
-
-    if (framework == nullptr) {
-      LOG(WARNING) << "Dropping update " << update << " from unknown framework 
"
-                   << update.framework_id();
-    } else {
-      forward(update, UPID(), framework);
-    }
-  }
-
-  // Notify all frameworks of the lost slave.
-  foreachvalue (Framework* framework, frameworks.registered) {
-    LOG(INFO) << "Notifying framework " << *framework << " of lost agent "
-              << slaveInfo.id() << " (" << slaveInfo.hostname() << ")";
-
-    LostSlaveMessage message;
-    message.mutable_slave_id()->MergeFrom(slaveInfo.id());
-    framework->send(message);
-  }
-
-  // Finally, notify the `SlaveLost` hooks.
-  if (HookManager::hooksAvailable()) {
-    HookManager::masterSlaveLostHook(slaveInfo);
-  }
-}
-
-
 void Master::reconcile(
     Framework* framework,
     const scheduler::Call::Reconcile& reconcile)
@@ -7150,6 +7167,55 @@ void Master::removeSlave(
 
   LOG(INFO) << "Removing agent " << *slave << ": " << message;
 
+  slaves.removing.insert(slave->id);
+
+  // Remove this slave from the registrar. Note that we update the
+  // registry BEFORE we update the master's in-memory state; this
+  // means that until the registry operation has completed, the slave
+  // is not considered to be removed (so we might offer its resources
+  // to frameworks, etc.). Ensuring that the registry update succeeds
+  // before we modify in-memory state ensures that external clients
+  // see consistent behavior if the master fails over.
+  registrar->apply(Owned<Operation>(new RemoveSlave(slave->info)))
+    .onAny(defer(self(),
+                 &Self::_removeSlave,
+                 slave,
+                 lambda::_1,
+                 message,
+                 reason));
+}
+
+
+void Master::_removeSlave(
+    Slave* slave,
+    const Future<bool>& registrarResult,
+    const string& removalCause,
+    Option<Counter> reason)
+{
+  CHECK_NOTNULL(slave);
+  CHECK(slaves.removing.contains(slave->info.id()));
+  slaves.removing.erase(slave->info.id());
+
+  CHECK(!registrarResult.isDiscarded());
+
+  if (registrarResult.isFailed()) {
+    LOG(FATAL) << "Failed to remove agent " << slave->info.id()
+               << " (" << slave->info.hostname() << ")"
+               << " from the registrar: " << registrarResult.failure();
+  }
+
+  CHECK(registrarResult.get())
+    << "Agent " << slave->info.id() << " (" << slave->info.hostname() << ") "
+    << "already removed from the registrar";
+
+  LOG(INFO) << "Removed agent " << slave->info.id() << " ("
+            << slave->info.hostname() << "): " << removalCause;
+
+  ++metrics->slave_removals;
+  if (reason.isSome()) {
+    ++utils::copy(reason.get()); // Remove const.
+  }
+
   // We want to remove the slave first, to avoid the allocator
   // re-allocating the recovered resources.
   //
@@ -7160,10 +7226,7 @@ void Master::removeSlave(
   // the slave is already removed.
   allocator->removeSlave(slave->id);
 
-  // Transition the tasks to lost and remove them, BUT do not send
-  // updates. Rather, build up the updates so that we can send them
-  // after the slave is removed from the registry.
-  vector<StatusUpdate> updates;
+  // Transition the tasks to lost and remove them.
   foreachkey (const FrameworkID& frameworkId, utils::copy(slave->tasks)) {
     foreachvalue (Task* task, utils::copy(slave->tasks[frameworkId])) {
       const StatusUpdate& update = protobuf::createStatusUpdate(
@@ -7173,7 +7236,7 @@ void Master::removeSlave(
           TASK_LOST,
           TaskStatus::SOURCE_MASTER,
           None(),
-          "Slave " + slave->info.hostname() + " removed: " + message,
+          "Slave " + slave->info.hostname() + " removed: " + removalCause,
           TaskStatus::REASON_SLAVE_REMOVED,
           (task->has_executor_id() ?
               Option<ExecutorID>(task->executor_id()) : None()));
@@ -7181,7 +7244,13 @@ void Master::removeSlave(
       updateTask(task, update);
       removeTask(task);
 
-      updates.push_back(update);
+      Framework* framework = getFramework(frameworkId);
+      if (framework == nullptr) {
+        LOG(WARNING) << "Dropping update " << update
+                     << " for unknown framework " << frameworkId;
+      } else {
+        forward(update, UPID(), framework);
+      }
     }
   }
 
@@ -7216,7 +7285,6 @@ void Master::removeSlave(
   slave->pendingTasks.clear();
 
   // Mark the slave as being removed.
-  slaves.removing.insert(slave->id);
   slaves.registered.remove(slave);
   slaves.removed.put(slave->id, Nothing());
   authenticated.erase(slave->pid);
@@ -7233,82 +7301,12 @@ void Master::removeSlave(
 
   // TODO(benh): unlink(slave->pid);
 
-  // Remove this slave from the registrar. Once this is completed, we
-  // can forward the LOST task updates to the frameworks and notify
-  // all frameworks that this slave was lost.
-  registrar->apply(Owned<Operation>(new RemoveSlave(slave->info)))
-    .onAny(defer(self(),
-                 &Self::_removeSlave,
-                 slave->info,
-                 updates,
-                 lambda::_1,
-                 message,
-                 reason));
+  sendSlaveLost(slave->info);
 
   delete slave;
 }
 
 
-void Master::_removeSlave(
-    const SlaveInfo& slaveInfo,
-    const vector<StatusUpdate>& updates,
-    const Future<bool>& registrarResult,
-    const string& removalCause,
-    Option<Counter> reason)
-{
-  CHECK(slaves.removing.contains(slaveInfo.id()));
-  slaves.removing.erase(slaveInfo.id());
-
-  CHECK(!registrarResult.isDiscarded());
-
-  if (registrarResult.isFailed()) {
-    LOG(FATAL) << "Failed to remove agent " << slaveInfo.id()
-               << " (" << slaveInfo.hostname() << ")"
-               << " from the registrar: " << registrarResult.failure();
-  }
-
-  CHECK(registrarResult.get())
-    << "Agent " << slaveInfo.id() << " (" << slaveInfo.hostname() << ") "
-    << "already removed from the registrar";
-
-  LOG(INFO) << "Removed agent " << slaveInfo.id() << " ("
-            << slaveInfo.hostname() << "): " << removalCause;
-
-  ++metrics->slave_removals;
-
-  if (reason.isSome()) {
-    ++utils::copy(reason.get()); // Remove const.
-  }
-
-  // Forward the LOST updates on to the framework.
-  foreach (const StatusUpdate& update, updates) {
-    Framework* framework = getFramework(update.framework_id());
-
-    if (framework == nullptr) {
-      LOG(WARNING) << "Dropping update " << update << " from unknown framework 
"
-                   << update.framework_id();
-    } else {
-      forward(update, UPID(), framework);
-    }
-  }
-
-  // Notify all frameworks of the lost slave.
-  foreachvalue (Framework* framework, frameworks.registered) {
-    LOG(INFO) << "Notifying framework " << *framework << " of lost agent "
-              << slaveInfo.id() << " (" << slaveInfo.hostname() << ")";
-
-    LostSlaveMessage message;
-    message.mutable_slave_id()->MergeFrom(slaveInfo.id());
-    framework->send(message);
-  }
-
-  // Finally, notify the `SlaveLost` hooks.
-  if (HookManager::hooksAvailable()) {
-    HookManager::masterSlaveLostHook(slaveInfo);
-  }
-}
-
-
 void Master::updateTask(Task* task, const StatusUpdate& update)
 {
   CHECK_NOTNULL(task);

http://git-wip-us.apache.org/repos/asf/mesos/blob/88b2533a/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 7e50359..714aa79 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -685,15 +685,20 @@ protected:
         std::vector<Archive::Framework>());
 
   void _markUnreachable(
-      const SlaveInfo& slaveInfo,
-      const std::vector<StatusUpdate>& updates,
-      const process::Future<bool>& registrarResult,
-      const std::string& unreachableCause);
+      Slave* slave,
+      TimeInfo unreachableTime,
+      const process::Future<bool>& registrarResult);
 
   // Mark a slave as unreachable in the registry. Called when the slave
   // does not re-register in time after a master failover.
   Nothing markUnreachableAfterFailover(const Registry::Slave& slave);
 
+  void _markUnreachableAfterFailover(
+      const SlaveInfo& slaveInfo,
+      const process::Future<bool>& registrarResult);
+
+  void sendSlaveLost(const SlaveInfo& slaveInfo);
+
   // Remove the slave from the registrar and from the master's state.
   //
   // TODO(bmahler): 'reason' is optional until MESOS-2317 is resolved.
@@ -703,8 +708,7 @@ protected:
       Option<process::metrics::Counter> reason = None());
 
   void _removeSlave(
-      const SlaveInfo& slaveInfo,
-      const std::vector<StatusUpdate>& updates,
+      Slave* slave,
       const process::Future<bool>& registrarResult,
       const std::string& removalCause,
       Option<process::metrics::Counter> reason = None());

http://git-wip-us.apache.org/repos/asf/mesos/blob/88b2533a/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 514c1d9..85ebd57 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -1821,6 +1821,7 @@ TEST_F(MasterTest, RecoveredSlaveCanReregister)
   EXPECT_EQ(1, stats.values["master/recovery_slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+  EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
   EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
   EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/88b2533a/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index f962f84..997eda3 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -156,6 +156,7 @@ TEST_P(PartitionTest, PartitionedSlave)
   EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+  EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
 
   driver.stop();
   driver.join();
@@ -430,6 +431,7 @@ TEST_P(PartitionTest, ReregisterSlaveNotPartitionAware)
   EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+  EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
 
   // We now complete the partition on the slave side as well. We
   // simulate a master loss event, which would normally happen during
@@ -637,9 +639,10 @@ TEST_P(PartitionTest, PartitionedSlaveReregistrationMasterFailover)
   EXPECT_EQ(slaveId, lostStatus.get().slave_id());
   EXPECT_EQ(partitionTime, lostStatus.get().unreachable_time());
 
-  // `sched2` should see TASK_UNREACHABLE.
+  // `sched2` should see TASK_LOST.
+  // TODO(neilc): Update this to expect TASK_UNREACHABLE.
   AWAIT_READY(unreachableStatus);
-  EXPECT_EQ(TASK_UNREACHABLE, unreachableStatus.get().state());
+  EXPECT_EQ(TASK_LOST, unreachableStatus.get().state());
   EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, unreachableStatus.get().reason());
   EXPECT_EQ(task2.task_id(), unreachableStatus.get().task_id());
   EXPECT_EQ(slaveId, unreachableStatus.get().slave_id());
@@ -799,9 +802,9 @@ TEST_P(PartitionTest, PartitionedSlaveOrphanedTask)
 
   // Now, induce a partition of the slave by having the master
   // timeout the slave.
-  Future<TaskStatus> unreachableStatus;
+  Future<TaskStatus> lostStatus;
   EXPECT_CALL(sched, statusUpdate(&driver, _))
-    .WillOnce(FutureArg<1>(&unreachableStatus));
+    .WillOnce(FutureArg<1>(&lostStatus));
 
   Future<Nothing> slaveLost;
   EXPECT_CALL(sched, slaveLost(&driver, _))
@@ -829,12 +832,13 @@ TEST_P(PartitionTest, PartitionedSlaveOrphanedTask)
 
   Clock::advance(Milliseconds(100));
 
-  AWAIT_READY(unreachableStatus);
-  EXPECT_EQ(TASK_UNREACHABLE, unreachableStatus.get().state());
-  EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, unreachableStatus.get().reason());
-  EXPECT_EQ(task.task_id(), unreachableStatus.get().task_id());
-  EXPECT_EQ(slaveId, unreachableStatus.get().slave_id());
-  EXPECT_EQ(partitionTime, unreachableStatus.get().unreachable_time());
+  // TODO(neilc): Update this to expect `TASK_UNREACHABLE`.
+  AWAIT_READY(lostStatus);
+  EXPECT_EQ(TASK_LOST, lostStatus.get().state());
+  EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, lostStatus.get().reason());
+  EXPECT_EQ(task.task_id(), lostStatus.get().task_id());
+  EXPECT_EQ(slaveId, lostStatus.get().slave_id());
+  EXPECT_EQ(partitionTime, lostStatus.get().unreachable_time());
 
   AWAIT_READY(slaveLost);
 
@@ -1079,6 +1083,7 @@ TEST_P(PartitionTest, PartitionedSlaveStatusUpdates)
   EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+  EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
 
   // At this point, the slave still thinks it's registered, so we
   // simulate a status update coming from the slave.
@@ -1252,6 +1257,7 @@ TEST_P(PartitionTest, PartitionedSlaveExitedExecutor)
   EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+  EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
 
   EXPECT_CALL(sched, executorLost(&driver, _, _, _))
     .Times(0);

http://git-wip-us.apache.org/repos/asf/mesos/blob/88b2533a/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 2e7accc..aa30118 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -165,6 +165,7 @@ TEST_F(SlaveTest, Shutdown)
   JSON::Object stats = Metrics();
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unregistered"]);
+  EXPECT_EQ(0, stats.values["master/slave_removals/reason_unhealthy"]);
 
   driver.stop();
   driver.join();
