Added a master flag to disallow agents without domain.

Added a new `--require_agent_domain` flag and implementation. When
set to true, it will cause the master to refuse (re-)registration
attempts for agents with no configured domain.

This is intended as a safety net for operators, who could forget to
configure the fault domain of a remote agent and let it join the
cluster. If this happens, an agent in a remote region will be
considered a local agent by the master and frameworks (because agent's
fault domain is not configured), causing tasks to potentially land in a
remote agent which is undesirable.

Review: https://reviews.apache.org/r/64507/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/79d18926
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/79d18926
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/79d18926

Branch: refs/heads/1.5.x
Commit: 79d189260eb705c73fd7fa356cef6e28351cb490
Parents: 4bc5f03
Author: Benno Evers <bev...@mesosphere.com>
Authored: Tue Jan 2 10:58:35 2018 -0800
Committer: Vinod Kone <vinodk...@gmail.com>
Committed: Tue Jan 2 11:45:33 2018 -0800

----------------------------------------------------------------------
 docs/configuration/master.md |  8 ++++
 src/master/flags.cpp         |  5 +++
 src/master/flags.hpp         |  1 +
 src/master/master.cpp        | 24 ++++++++++
 src/tests/master_tests.cpp   | 93 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 131 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/79d18926/docs/configuration/master.md
----------------------------------------------------------------------
diff --git a/docs/configuration/master.md b/docs/configuration/master.md
index 6af16a0..3ff97ea 100644
--- a/docs/configuration/master.md
+++ b/docs/configuration/master.md
@@ -487,6 +487,14 @@ after which the operation is considered a failure. 
(default: 20secs)
 </tr>
 <tr>
   <td>
+    --[no-]require_agent_domain
+  </td>
+  <td>
+If true, only agents with a configured domain can register. (default: false)
+  </td>
+</tr>
+<tr>
+  <td>
     --roles=VALUE
   </td>
   <td>

http://git-wip-us.apache.org/repos/asf/mesos/blob/79d18926/src/master/flags.cpp
----------------------------------------------------------------------
diff --git a/src/master/flags.cpp b/src/master/flags.cpp
index 18f405b..b49cb63 100644
--- a/src/master/flags.cpp
+++ b/src/master/flags.cpp
@@ -642,6 +642,11 @@ mesos::internal::master::Flags::Flags()
       "the IP address which the master will try to bind to.\n"
       "Cannot be used in conjunction with `--ip`.");
 
+  add(&Flags::require_agent_domain,
+      "require_agent_domain",
+      "If true, only agents with a configured domain can register.\n",
+      false);
+
   add(&Flags::domain,
       "domain",
       "Domain that the master belongs to. Mesos currently only supports\n"

http://git-wip-us.apache.org/repos/asf/mesos/blob/79d18926/src/master/flags.hpp
----------------------------------------------------------------------
diff --git a/src/master/flags.hpp b/src/master/flags.hpp
index edda71a..dabb414 100644
--- a/src/master/flags.hpp
+++ b/src/master/flags.hpp
@@ -96,6 +96,7 @@ public:
   Duration registry_gc_interval;
   Duration registry_max_agent_age;
   size_t registry_max_agent_count;
+  bool require_agent_domain;
   Option<DomainInfo> domain;
 
   // The following flags are executable specific (e.g., since we only

http://git-wip-us.apache.org/repos/asf/mesos/blob/79d18926/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 4528687..03eb178 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -6207,6 +6207,18 @@ void Master::_registerSlave(
     return;
   }
 
+  // Don't allow agents without domain if domains are required.
+  // We don't shutdown the agent to allow it to restart itself with
+  // the correct domain and without losing tasks.
+  if (flags.require_agent_domain && !slaveInfo.has_domain()) {
+    LOG(WARNING) << "Agent at " << pid << " attempted to register without "
+                 << "a domain, but this master is configured to require agent "
+                 << "domains. Ignoring agent registration attempt";
+
+    slaves.registering.erase(pid);
+    return;
+  }
+
   // Check if this slave is already registered (because it retries).
   if (Slave* slave = slaves.registered.get(pid)) {
     if (!slave->connected) {
@@ -6578,6 +6590,18 @@ void Master::_reregisterSlave(
     return;
   }
 
+  // Don't allow agents without domain if domains are required.
+  // We don't shutdown the agent to allow it to restart itself with
+  // the correct domain and without losing tasks.
+  if (flags.require_agent_domain && !slaveInfo.has_domain()) {
+    LOG(WARNING) << "Agent at " << pid << " attempted to register without "
+                 << "a domain, but this master is configured to require agent "
+                 << "domains. Ignoring agent re-registration attempt";
+
+    slaves.reregistering.erase(slaveInfo.id());
+    return;
+  }
+
   if (Slave* slave = slaves.registered.get(slaveInfo.id())) {
     CHECK(!slaves.recovered.contains(slaveInfo.id()));
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/79d18926/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 3f9e5f4..5546fd9 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -8222,6 +8222,99 @@ TEST_F(MasterTest, AgentDomainMismatch)
 }
 
 
+// This test checks that if the `--require_agent_domain` flag is set and
+// the agent does not have a domain configured, the registration attempt
+// will fail.
+TEST_F(MasterTest, RegistrationWithoutRequiredAgentDomain)
+{
+  Clock::pause();
+
+  master::Flags masterFlags = CreateMasterFlags();
+  masterFlags.domain = createDomainInfo("region-abc", "zone-456");
+  masterFlags.require_agent_domain = true;
+
+  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  // Agent should attempt to register.
+  Future<RegisterSlaveMessage> registerSlaveMessage =
+    FUTURE_PROTOBUF(RegisterSlaveMessage(), _, _);
+
+  // If the agent is allowed to register, the master will update the
+  // registry. The agent should not be allowed to register, so we
+  // expect that no registrar operations will be observed.
+  EXPECT_CALL(*master.get()->registrar, apply(_))
+    .Times(0);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  slave::Flags slaveFlags = CreateSlaveFlags();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags);
+  ASSERT_SOME(slave);
+
+  Clock::advance(slaveFlags.registration_backoff_factor);
+  Clock::settle();
+
+  AWAIT_READY(registerSlaveMessage);
+}
+
+
+// This test checks that if the `--require_agent_domain` flag is set and
+// the agent does not have a domain configured when trying to re-register,
+// the re-registration attempt will fail.
+TEST_F(MasterTest, ReregistrationWithoutRequiredAgentDomain)
+{
+  Clock::pause();
+
+  master::Flags masterFlags = CreateMasterFlags();
+  masterFlags.domain = createDomainInfo("region-abc", "zone-456");
+  masterFlags.require_agent_domain = true;
+
+  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  slave::Flags slaveFlags = CreateSlaveFlags();
+  slaveFlags.domain = createDomainInfo("region-abc", "zone-456");
+
+  // Agent should successfully register.
+  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags);
+
+  Clock::advance(slaveFlags.registration_backoff_factor);
+
+  AWAIT_READY(slaveRegisteredMessage);
+
+  // Shut down slave and re-register without domain. We do this by
+  // intercepting the `ReregisterSlaveMessage` and erasing the domain
+  // before passing it on to the master.
+  Future<ReregisterSlaveMessage> reregisterSlaveMessage =
+    DROP_PROTOBUF(ReregisterSlaveMessage(), _, _);
+
+  slave.get()->shutdown();
+  slave = StartSlave(detector.get(), slaveFlags);
+  ASSERT_SOME(slave);
+
+  Clock::advance(slaveFlags.authentication_backoff_factor);
+  Clock::advance(slaveFlags.registration_backoff_factor);
+
+  AWAIT_READY(reregisterSlaveMessage);
+
+  ReregisterSlaveMessage message = reregisterSlaveMessage.get();
+  message.mutable_slave()->clear_domain();
+
+  // If the agent is allowed to register, the master will update the
+  // registry. The agent should not be allowed to register, so we
+  // expect that no registrar operations will be observed.
+  EXPECT_CALL(*master.get()->registrar, apply(_))
+    .Times(0);
+
+  process::post(slave.get()->pid, master.get()->pid, message);
+  Clock::settle();
+}
+
+
 // This test checks that if the agent is configured with a domain but
 // the master is not, the agent is not allowed to re-register. This
 // might happen if the leading master is configured with a domain but

Reply via email to