Repository: mesos Updated Branches: refs/heads/master 5e88bc076 -> d02d66043
Correctly reset slave status when aborting a registration. Previously, the slave was not erased from the `registering` and `reregistering` sets in the master for some code paths that would result in a failed (re-)registration attempt. This could lead to a state where the reason of the unsuccessful (re-)registration attempt is fixed on the agent, but the master ignores subsequent attempts because it assumes the previous operation is still in progress. Review: https://reviews.apache.org/r/64506/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/3eb57cae Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/3eb57cae Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/3eb57cae Branch: refs/heads/master Commit: 3eb57cae3674fc835c784cac9eaa63e1aab7ba1c Parents: 5e88bc0 Author: Benno Evers <bev...@mesosphere.com> Authored: Tue Jan 2 10:58:23 2018 -0800 Committer: Vinod Kone <vinodk...@gmail.com> Committed: Tue Jan 2 10:58:23 2018 -0800 ---------------------------------------------------------------------- src/master/master.cpp | 27 +++++++++++++++++++++++++++ src/tests/master_tests.cpp | 16 ++++++++++++++++ 2 files changed, 43 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/3eb57cae/src/master/master.cpp ---------------------------------------------------------------------- diff --git a/src/master/master.cpp b/src/master/master.cpp index 04378a8..4528687 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -6178,12 +6178,16 @@ void Master::_registerSlave( LOG(WARNING) << "Failed to parse version '" << version << "'" << " of agent at " << pid << ": " << parsedVersion.error() << "; ignoring agent registration attempt"; + + slaves.registering.erase(pid); return; } else if (parsedVersion.get() < MINIMUM_AGENT_VERSION) { LOG(WARNING) << "Ignoring registration attempt from old agent at " << pid << ": agent version is " << parsedVersion.get() << ", minimum supported agent version is " << MINIMUM_AGENT_VERSION; + + slaves.registering.erase(pid); return; } @@ -6198,6 +6202,8 @@ void Master::_registerSlave( << "domain " << slaveInfo.domain() << " " << "but the master has no configured domain. " << "Ignoring agent registration attempt"; + + slaves.registering.erase(pid); return; } @@ -6418,12 +6424,15 @@ void Master::reregisterSlave( << slaveInfo.id() << " at " << from << " (" << slaveInfo.hostname() << ")"; + // TODO(bevers): Create a guard object calling `insert()` in its constructor + // and `erase()` in its destructor, to avoid the manual bookkeeping. slaves.reregistering.insert(slaveInfo.id()); // Update all resources passed by the agent to `POST_RESERVATION_REFINEMENT` // format. We do this as early as possible so that we only use a single // format inside master, and downgrade again if necessary when they leave the // master (e.g. when writing to the registry). + // // TODO(bevers): Also convert the resources in `ExecutorInfos` and `Tasks` // here for consistency. convertResourceFormat( @@ -6490,6 +6499,8 @@ void Master::_reregisterSlave( << "Ignoring re-register agent message from agent " << slaveInfo.id() << " at " << pid << " (" << slaveInfo.hostname() << ") as a gone operation is already in progress"; + + slaves.reregistering.erase(slaveInfo.id()); return; } @@ -6500,6 +6511,8 @@ void Master::_reregisterSlave( ShutdownMessage message; message.set_message("Agent has been marked gone"); send(pid, message); + + slaves.reregistering.erase(slaveInfo.id()); return; } @@ -6536,12 +6549,16 @@ void Master::_reregisterSlave( LOG(WARNING) << "Failed to parse version '" << version << "'" << " of agent at " << pid << ": " << parsedVersion.error() << "; ignoring agent re-registration attempt"; + + slaves.reregistering.erase(slaveInfo.id()); return; } else if (parsedVersion.get() < MINIMUM_AGENT_VERSION) { LOG(WARNING) << "Ignoring re-registration attempt from old agent at " << pid << ": agent version is " << parsedVersion.get() << ", minimum supported agent version is " << MINIMUM_AGENT_VERSION; + + slaves.reregistering.erase(slaveInfo.id()); return; } @@ -6556,6 +6573,8 @@ void Master::_reregisterSlave( << "domain " << slaveInfo.domain() << " " << "but the master has no configured domain." << "Ignoring agent re-registration attempt"; + + slaves.reregistering.erase(slaveInfo.id()); return; } @@ -6682,6 +6701,8 @@ void Master::__reregisterSlave( << "Ignoring re-register agent message from agent " << slaveInfo.id() << " at " << pid << " (" << slaveInfo.hostname() << ") as a gone operation is already in progress"; + + slaves.reregistering.erase(slaveInfo.id()); return; } @@ -6692,6 +6713,8 @@ void Master::__reregisterSlave( ShutdownMessage message; message.set_message("Agent has been marked gone"); send(pid, message); + + slaves.reregistering.erase(slaveInfo.id()); return; } @@ -6955,6 +6978,8 @@ void Master::___reregisterSlave( << "Ignoring re-register agent message from agent " << slaveInfo.id() << " at " << pid << " (" << slaveInfo.hostname() << ") as a gone operation is already in progress"; + + slaves.reregistering.erase(slaveInfo.id()); return; } @@ -6965,6 +6990,8 @@ void Master::___reregisterSlave( ShutdownMessage message; message.set_message("Agent has been marked gone"); send(pid, message); + + slaves.reregistering.erase(slaveInfo.id()); return; } http://git-wip-us.apache.org/repos/asf/mesos/blob/3eb57cae/src/tests/master_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp index 2393b27..3f9e5f4 100644 --- a/src/tests/master_tests.cpp +++ b/src/tests/master_tests.cpp @@ -8312,6 +8312,22 @@ TEST_F(MasterTest, IgnoreOldAgentRegistration) // Settle the clock to retire in-flight messages. Clock::settle(); + + // Now, try again with the correct version. The slave should be able to + // register. + Future<SlaveRegisteredMessage> slaveRegisteredMessage = + FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _); + + EXPECT_CALL(*master.get()->registrar, apply(_)) + .Times(1); + + slave = StartSlave(detector.get(), slaveFlags); + ASSERT_SOME(slave); + + Clock::advance(slaveFlags.registration_backoff_factor); + Clock::settle(); + + AWAIT_READY(slaveRegisteredMessage); }