Repository: mesos
Updated Branches:
  refs/heads/master 5e88bc076 -> d02d66043


Correctly reset slave status when aborting a registration.

Previously, the slave was not erased from the `registering`
and `reregistering` sets in the master for some code paths
that would result in a failed (re-)registration attempt.

This could lead to a state where the reason of the unsuccessful
(re-)registration attempt is fixed on the agent, but the master
ignores subsequent attempts because it assumes the previous
operation is still in progress.

Review: https://reviews.apache.org/r/64506/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/3eb57cae
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/3eb57cae
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/3eb57cae

Branch: refs/heads/master
Commit: 3eb57cae3674fc835c784cac9eaa63e1aab7ba1c
Parents: 5e88bc0
Author: Benno Evers <bev...@mesosphere.com>
Authored: Tue Jan 2 10:58:23 2018 -0800
Committer: Vinod Kone <vinodk...@gmail.com>
Committed: Tue Jan 2 10:58:23 2018 -0800

----------------------------------------------------------------------
 src/master/master.cpp      | 27 +++++++++++++++++++++++++++
 src/tests/master_tests.cpp | 16 ++++++++++++++++
 2 files changed, 43 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/3eb57cae/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 04378a8..4528687 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -6178,12 +6178,16 @@ void Master::_registerSlave(
     LOG(WARNING) << "Failed to parse version '" << version << "'"
                  << " of agent at " << pid << ": " << parsedVersion.error()
                  << "; ignoring agent registration attempt";
+
+    slaves.registering.erase(pid);
     return;
   } else if (parsedVersion.get() < MINIMUM_AGENT_VERSION) {
     LOG(WARNING) << "Ignoring registration attempt from old agent at "
                  << pid << ": agent version is " << parsedVersion.get()
                  << ", minimum supported agent version is "
                  << MINIMUM_AGENT_VERSION;
+
+    slaves.registering.erase(pid);
     return;
   }
 
@@ -6198,6 +6202,8 @@ void Master::_registerSlave(
                  << "domain " << slaveInfo.domain() << " "
                  << "but the master has no configured domain. "
                  << "Ignoring agent registration attempt";
+
+    slaves.registering.erase(pid);
     return;
   }
 
@@ -6418,12 +6424,15 @@ void Master::reregisterSlave(
             << slaveInfo.id() << " at " << from << " ("
             << slaveInfo.hostname() << ")";
 
+  // TODO(bevers): Create a guard object calling `insert()` in its constructor
+  // and `erase()` in its destructor, to avoid the manual bookkeeping.
   slaves.reregistering.insert(slaveInfo.id());
 
   // Update all resources passed by the agent to `POST_RESERVATION_REFINEMENT`
   // format. We do this as early as possible so that we only use a single
   // format inside master, and downgrade again if necessary when they leave the
   // master (e.g. when writing to the registry).
+  //
   // TODO(bevers): Also convert the resources in `ExecutorInfos` and `Tasks`
   // here for consistency.
   convertResourceFormat(
@@ -6490,6 +6499,8 @@ void Master::_reregisterSlave(
       << "Ignoring re-register agent message from agent "
       << slaveInfo.id() << " at " << pid << " ("
       << slaveInfo.hostname() << ") as a gone operation is already in 
progress";
+
+    slaves.reregistering.erase(slaveInfo.id());
     return;
   }
 
@@ -6500,6 +6511,8 @@ void Master::_reregisterSlave(
     ShutdownMessage message;
     message.set_message("Agent has been marked gone");
     send(pid, message);
+
+    slaves.reregistering.erase(slaveInfo.id());
     return;
   }
 
@@ -6536,12 +6549,16 @@ void Master::_reregisterSlave(
     LOG(WARNING) << "Failed to parse version '" << version << "'"
                  << " of agent at " << pid << ": " << parsedVersion.error()
                  << "; ignoring agent re-registration attempt";
+
+    slaves.reregistering.erase(slaveInfo.id());
     return;
   } else if (parsedVersion.get() < MINIMUM_AGENT_VERSION) {
     LOG(WARNING) << "Ignoring re-registration attempt from old agent at "
                  << pid << ": agent version is " << parsedVersion.get()
                  << ", minimum supported agent version is "
                  << MINIMUM_AGENT_VERSION;
+
+    slaves.reregistering.erase(slaveInfo.id());
     return;
   }
 
@@ -6556,6 +6573,8 @@ void Master::_reregisterSlave(
                  << "domain " << slaveInfo.domain() << " "
                  << "but the master has no configured domain."
                  << "Ignoring agent re-registration attempt";
+
+    slaves.reregistering.erase(slaveInfo.id());
     return;
   }
 
@@ -6682,6 +6701,8 @@ void Master::__reregisterSlave(
       << "Ignoring re-register agent message from agent "
       << slaveInfo.id() << " at " << pid << " ("
       << slaveInfo.hostname() << ") as a gone operation is already in 
progress";
+
+    slaves.reregistering.erase(slaveInfo.id());
     return;
   }
 
@@ -6692,6 +6713,8 @@ void Master::__reregisterSlave(
     ShutdownMessage message;
     message.set_message("Agent has been marked gone");
     send(pid, message);
+
+    slaves.reregistering.erase(slaveInfo.id());
     return;
   }
 
@@ -6955,6 +6978,8 @@ void Master::___reregisterSlave(
       << "Ignoring re-register agent message from agent "
       << slaveInfo.id() << " at " << pid << " ("
       << slaveInfo.hostname() << ") as a gone operation is already in 
progress";
+
+    slaves.reregistering.erase(slaveInfo.id());
     return;
   }
 
@@ -6965,6 +6990,8 @@ void Master::___reregisterSlave(
     ShutdownMessage message;
     message.set_message("Agent has been marked gone");
     send(pid, message);
+
+    slaves.reregistering.erase(slaveInfo.id());
     return;
   }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/3eb57cae/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 2393b27..3f9e5f4 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -8312,6 +8312,22 @@ TEST_F(MasterTest, IgnoreOldAgentRegistration)
 
   // Settle the clock to retire in-flight messages.
   Clock::settle();
+
+  // Now, try again with the correct version. The slave should be able to
+  // register.
+  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
+
+  EXPECT_CALL(*master.get()->registrar, apply(_))
+    .Times(1);
+
+  slave = StartSlave(detector.get(), slaveFlags);
+  ASSERT_SOME(slave);
+
+  Clock::advance(slaveFlags.registration_backoff_factor);
+  Clock::settle();
+
+  AWAIT_READY(slaveRegisteredMessage);
 }
 
 

Reply via email to