This is an automated email from the ASF dual-hosted git repository.

bmahler pushed a commit to branch 1.8.x
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 4914cb74ff3472f96a10e324adb8b8626847da7e
Author:     Benjamin Mahler <bmah...@apache.org>
AuthorDate: Mon Aug 31 19:56:34 2020 -0400

    Added a regression test for MESOS-9609.

    Per MESOS-9609, it's possible for the master to encounter a CHECK
    failure during agent removal in the following situation:

      1. Given a framework with checkpoint == false, with only
         executor(s) (no tasks) running on an agent:

      2. When this agent disconnects from the master,
         Master::removeFramework(Slave*, Framework*) removes the
         tasks and executors. However, when there are no tasks, this
         function will accidentally insert an entry into
         Master::Slave::tasks! (Due to the [] operator usage)

      3. Now if the framework is removed, we have an entry in
         Slave::tasks, for which there is no corresponding framework.

      4. When the agent is removed, we have a CHECK failure given we
         can't find the framework.

    This test runs through the above scenario, which no longer
    crashes given the fix applied.

    Review: https://reviews.apache.org/r/72830
---
 src/tests/master_tests.cpp | 175 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)

diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 964d935..cedbb2b 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -79,6 +79,8 @@
 #include "tests/resources_utils.hpp"
 #include "tests/utils.hpp"
 
+namespace http = process::http;
+
 using google::protobuf::RepeatedPtrField;
 
 using mesos::internal::master::Master;
@@ -102,6 +104,7 @@ using mesos::v1::scheduler::Call;
 using mesos::v1::scheduler::Event;
 
 using process::Clock;
+using process::Failure;
 using process::Future;
 using process::Message;
 using process::Owned;
@@ -7616,6 +7619,178 @@ TEST_F(MasterTest, FailoverAgentReregisterFirst)
 }
 
 
+// TODO(bmahler): Pull up a more generic helper for making it easy
+// to send calls and get back responses. Ideally getting back the
+// appropriate Response::X message (e.g. Response::GetAgents).
+Future<v1::master::Response> post(
+    const process::PID<master::Master>& pid,
+    const v1::master::Call& call)
+{
+  http::Headers headers = createBasicAuthHeaders(DEFAULT_CREDENTIAL);
+  headers["Accept"] = stringify(ContentType::PROTOBUF);
+
+  return http::post(
+      pid,
+      "api/v1",
+      headers,
+      serialize(ContentType::PROTOBUF, call),
+      stringify(ContentType::PROTOBUF))
+    .then([](const http::Response& response)
+          -> Future<v1::master::Response> {
+      if (response.status != http::OK().status) {
+        return Failure("Unexpected response status " + response.status);
+      }
+
+      return deserialize<v1::master::Response>(
+          ContentType::PROTOBUF, response.body);
+    });
+}
+
+
+// This is a regression test for MESOS-9609.
+//
+// 1. Given a framework with checkpoint == false, with only
+//    executor(s) (no tasks) running on an agent:
+// 2. When this agent disconnects from the master,
+//    Master::removeFramework(Slave*, Framework*) removes the
+//    tasks and executors. However, when there are no tasks, this
+//    function previously accidentally inserted an entry into
+//    Master::Slave::tasks! (Due to a [] operator usage)
+// 3. Now if the framework is removed, we have an entry in
+//    Slave::tasks, for which there is no corresponding framework.
+// 4. When the agent is removed, we have a CHECK failure given
+//    we can't find the framework.
+TEST_F(MasterTest, NonCheckpointingFrameworkAgentDisconnectionExecutorOnly)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), master.get()->pid, _);
+
+  MockExecutor exec(DEFAULT_EXECUTOR_ID);
+  TestContainerizer containerizer(&exec);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), &containerizer);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(slaveRegisteredMessage);
+  const SlaveID& slaveId = slaveRegisteredMessage->slave_id();
+
+  // Start a non-checkpointing framework, run a task that
+  // immediately completes but whose executor stays up.
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_checkpoint(false);
+
+  MockScheduler sched;
+  TestingMesosSchedulerDriver driver(
+      &sched, detector.get(), DEFAULT_FRAMEWORK_INFO);
+
+  Future<FrameworkID> frameworkId;
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .WillOnce(FutureArg<1>(&frameworkId));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(frameworkId);
+
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->empty());
+
+  const Offer& offer = offers->at(0);
+
+  TaskInfo task;
+  task.set_name("");
+  task.mutable_task_id()->set_value("1");
+  *task.mutable_slave_id() = offer.slave_id();
+  *task.mutable_resources() = offer.resources();
+  *task.mutable_executor() = DEFAULT_EXECUTOR_INFO;
+
+  EXPECT_CALL(exec, registered(_, _, _, _));
+  EXPECT_CALL(exec, launchTask(_, _))
+    .WillRepeatedly(SendStatusUpdateFromTask(TASK_FINISHED));
+
+  Future<TaskStatus> finishedStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&finishedStatus));
+
+  driver.launchTasks(offer.id(), {task});
+
+  AWAIT_READY(finishedStatus);
+  EXPECT_EQ(TASK_FINISHED, finishedStatus->state());
+
+  // Make sure that the status update is acknowledged.
+  Clock::pause();
+  Clock::settle();
+  Clock::resume();
+
+  // Spoof an agent disconnection and check that it took effect.
+  process::inject::exited(slave.get()->pid, master.get()->pid);
+
+  // Check that the agent disconnected.
+  {
+    v1::master::Call call;
+    call.set_type(v1::master::Call::GET_AGENTS);
+
+    Future<v1::master::Response> response = post(master.get()->pid, call);
+    AWAIT_READY(response);
+    ASSERT_EQ(v1::master::Response::GET_AGENTS, response->type());
+
+    ASSERT_EQ(1, response->get_agents().agents_size());
+
+    const v1::master::Response::GetAgents::Agent& agent =
+      response->get_agents().agents(0);
+
+    EXPECT_EQ(false, agent.active());
+  }
+
+  EXPECT_CALL(exec, shutdown(_));
+
+  // Now remove the framework, which should occur immediately upon
+  // disconnection since the framework's failover timeout is 0.
+  driver.stop(true);
+
+  // Check to make sure the framework is removed.
+  {
+    v1::master::Call call;
+    call.set_type(v1::master::Call::GET_FRAMEWORKS);
+
+    Future<v1::master::Response> response = post(master.get()->pid, call);
+    AWAIT_READY(response);
+    ASSERT_EQ(v1::master::Response::GET_FRAMEWORKS, response->type());
+
+    ASSERT_EQ(0, response->get_frameworks().frameworks_size());
+    ASSERT_EQ(1, response->get_frameworks().completed_frameworks_size());
+  }
+
+  // Now remove the agent, which would trigger the crash in MESOS-9609.
+  {
+    v1::master::Call v1Call;
+    v1Call.set_type(v1::master::Call::MARK_AGENT_GONE);
+
+    v1::master::Call::MarkAgentGone* markAgentGone =
+      v1Call.mutable_mark_agent_gone();
+
+    *markAgentGone->mutable_agent_id() = evolve(slaveId);
+
+    Future<process::http::Response> response = process::http::post(
+        master.get()->pid,
+        "api/v1",
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL),
+        serialize(ContentType::PROTOBUF, v1Call),
+        stringify(ContentType::PROTOBUF));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(process::http::OK().status, response);
+  }
+}
+
+
 // In this test, an agent restarts, responds to pings, but does not
 // reregister with the master; the master should mark the agent
 // unreachable after waiting for `agent_reregister_timeout`. In
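
For readers unfamiliar with the root cause described in the commit message above, the following is a minimal standalone sketch of the operator[] pitfall: a read-style lookup on a map silently inserts a default-constructed entry. It uses plain std::unordered_map and hypothetical names rather than the master's actual hashmap-based Master::Slave::tasks, so it illustrates the class of bug being guarded against, not the Mesos code itself.

// Standalone illustration only; not Mesos code.
#include <cassert>
#include <string>
#include <unordered_map>

int main()
{
  // Hypothetical stand-in for a per-agent task map keyed by framework id.
  std::unordered_map<std::string, std::unordered_map<std::string, int>> tasks;

  const std::string frameworkId = "framework-1";

  // Intending only to *read* the framework's tasks, but operator[]
  // inserts an empty inner map for "framework-1" as a side effect.
  for (const auto& task : tasks[frameworkId]) {
    (void) task; // There are no tasks to iterate over.
  }

  // An entry now exists for a framework that owns no tasks; later code
  // that assumes every key has a corresponding framework breaks (in the
  // master's case, a CHECK failure during agent removal).
  assert(tasks.count(frameworkId) == 1);

  // Looking up with find() instead leaves the container untouched.
  std::unordered_map<std::string, std::unordered_map<std::string, int>> other;
  assert(other.find(frameworkId) == other.end());
  assert(other.empty());

  return 0;
}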