Removed dependency on "registry_strict" in master failover. When the master fails over, agents have `agent_reregister_timeout` to reregister with the new master. Any agents that fail to reregister within the timeout will be marked unreachable in the registry. Previously, frameworks would only receive a `slaveLost` callback for such agents if the master was running in "registry_strict" mode. This commit changes the master to always inform frameworks about lost agents, regardless of the "registry_strict" flag.
Review: https://reviews.apache.org/r/51955/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/905204e5 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/905204e5 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/905204e5 Branch: refs/heads/master Commit: 905204e54748c5842f96997db41f9ff14d9246ab Parents: ef9211f Author: Neil Conway <neil.con...@gmail.com> Authored: Mon Sep 19 15:49:23 2016 -0700 Committer: Vinod Kone <vinodk...@gmail.com> Committed: Mon Sep 19 15:49:23 2016 -0700 ---------------------------------------------------------------------- src/master/master.cpp | 27 ++++---------- src/tests/master_tests.cpp | 81 ----------------------------------------- 2 files changed, 7 insertions(+), 101 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/905204e5/src/master/master.cpp ---------------------------------------------------------------------- diff --git a/src/master/master.cpp b/src/master/master.cpp index 763c5e7..66a672f 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -1880,27 +1880,14 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave) TimeInfo unreachableTime = protobuf::getCurrentTime(); - if (flags.registry_strict) { - slaves.markingUnreachable.insert(slave.info().id()); + slaves.markingUnreachable.insert(slave.info().id()); - registrar->apply(Owned<Operation>( - new MarkSlaveUnreachable(slave.info(), unreachableTime))) - .onAny(defer(self(), - &Self::_markUnreachableAfterFailover, - slave.info(), - lambda::_1)); - } else { - // When a non-strict registry is in use, we want to ensure the - // registry is used in a write-only manner. Therefore we remove - // the slave from the registry but we do not inform the - // framework. 
- const string& message = - "Failed to mark agent " + stringify(slave.info().id()) + " unreachable"; - - registrar->apply(Owned<Operation>( - new MarkSlaveUnreachable(slave.info(), unreachableTime))) - .onFailed(lambda::bind(fail, message, lambda::_1)); - } + registrar->apply(Owned<Operation>( + new MarkSlaveUnreachable(slave.info(), unreachableTime))) + .onAny(defer(self(), + &Self::_markUnreachableAfterFailover, + slave.info(), + lambda::_1)); return Nothing(); } http://git-wip-us.apache.org/repos/asf/mesos/blob/905204e5/src/tests/master_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp index 6c49ab3..a32ac12 100644 --- a/src/tests/master_tests.cpp +++ b/src/tests/master_tests.cpp @@ -1967,87 +1967,6 @@ TEST_F(MasterTest, RecoveredSlaveCanReregister) } -// This test ensures that a non-strict registry is write-only by -// inducing a slave removal during recovery. After which, we expect -// that the framework is *not* informed, and we expect that the -// slave can re-register successfully. -TEST_F(MasterTest, NonStrictRegistryWriteOnly) -{ - // Step 1: Start a master. - master::Flags masterFlags = CreateMasterFlags(); - masterFlags.registry_strict = false; - - Try<Owned<cluster::Master>> master = StartMaster(masterFlags); - ASSERT_SOME(master); - - // Step 2: Start a slave. - Future<SlaveRegisteredMessage> slaveRegisteredMessage = - FUTURE_PROTOBUF(SlaveRegisteredMessage(), master.get()->pid, _); - - // Reuse slaveFlags so both StartSlave() use the same work_dir. - slave::Flags slaveFlags = this->CreateSlaveFlags(); - - Owned<MasterDetector> detector = master.get()->createDetector(); - Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags); - ASSERT_SOME(slave); - - AWAIT_READY(slaveRegisteredMessage); - - // Step 3: Stop the slave while the master is down. 
- master->reset(); - slave.get()->terminate(); - slave->reset(); - - // Step 4: Restart the master. - master = StartMaster(masterFlags); - ASSERT_SOME(master); - - // Step 5: Start a scheduler. - MockScheduler sched; - MesosSchedulerDriver driver( - &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL); - - Future<Nothing> registered; - EXPECT_CALL(sched, registered(&driver, _, _)) - .WillOnce(FutureSatisfy(&registered)); - - EXPECT_CALL(sched, resourceOffers(&driver, _)) - .WillRepeatedly(Return()); // Ignore offers. - - driver.start(); - - AWAIT_READY(registered); - - // Step 6: Advance the clock and make sure the slave is not - // removed! - Future<Nothing> slaveLost; - EXPECT_CALL(sched, slaveLost(&driver, _)) - .WillRepeatedly(FutureSatisfy(&slaveLost)); - - Clock::pause(); - Clock::advance(masterFlags.agent_reregister_timeout); - Clock::settle(); - - ASSERT_TRUE(slaveLost.isPending()); - - Clock::resume(); - - // Step 7: Now expect the slave to be able to re-register, - // according to the non-strict semantics. - Future<SlaveReregisteredMessage> slaveReregisteredMessage = - FUTURE_PROTOBUF(SlaveReregisteredMessage(), master.get()->pid, _); - - detector = master.get()->createDetector(); - slave = StartSlave(detector.get(), slaveFlags); - ASSERT_SOME(slave); - - AWAIT_READY(slaveReregisteredMessage); - - driver.stop(); - driver.join(); -} - - // This test ensures that slave removals during master recovery // are rate limited. TEST_F(MasterTest, RateLimitRecoveredSlaveRemoval)