This is an automated email from the ASF dual-hosted git repository.

josephwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit a0baa345b2eb6483648fcbcd7af8aa9eac9148c3
Author: Joseph Wu <josep...@apache.org>
AuthorDate: Wed Feb 13 15:00:51 2019 -0800

    Modified when master responds to operation status updates.
    
    When dealing with orphaned operation status updates, there are two
    cases the master must deal with:
    - The simple case is when the master knows the framework is completed.
      These status updates can be acknowledged by the master.
    - However, a completed framework can be rotated out of the master's
      memory.  In addition, after master failover, if an agent reregisters
      before the framework, an operation can appear to be orphaned until
      the framework reregisters.
    
    This adds a fixed delay between agent reregistration and when the
    master acknowledges operation status updates from unknown frameworks.
    The delay should give frameworks ample time to reregister.
    
    The delay is based on agent reregistration in order to mitigate the
    delay of acknowledging status updates of frameworks rotated out of
    the completed frameworks buffer.
    
    Review: https://reviews.apache.org/r/69980
---
 src/master/constants.hpp |  7 +++++++
 src/master/master.cpp    | 30 +++++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/master/constants.hpp b/src/master/constants.hpp
index b0ab918..8f729d1 100644
--- a/src/master/constants.hpp
+++ b/src/master/constants.hpp
@@ -106,6 +106,13 @@ constexpr size_t DEFAULT_MAX_COMPLETED_TASKS_PER_FRAMEWORK = 1000;
 // to store in the cache.
 constexpr size_t DEFAULT_MAX_UNREACHABLE_TASKS_PER_FRAMEWORK = 1000;
 
+// The minimum amount of time the master waits for a framework to reregister
+// before the master adopts any operations originating from that
+// framework. This applies to any framework not explicitly marked "completed"
+// in the master's memory.
+// Adopted operations will be acknowledged by the master.
+constexpr Duration MIN_WAIT_BEFORE_ORPHAN_OPERATION_ADOPTION = Minutes(10);
+
 // Time interval to check for updated watchers list.
 constexpr Duration WHITELIST_WATCH_INTERVAL = Seconds(5);
 
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 1e04d82..f71c6fd 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8725,9 +8725,13 @@ void Master::updateOperationStatus(UpdateOperationStatusMessage&& update)
 
   const OperationStatus& latestStatus = *operation->statuses().rbegin();
 
+  // Frameworks are sent operation status updates when the operation has
+  // a framework-specified ID and the framework is still running.
+  // Orphaned operations have no framework to send updates to.
   bool frameworkWillAcknowledge =
     operation->info().has_id() &&
-    !isCompletedFramework(frameworkId.get());
+    !isCompletedFramework(frameworkId.get()) &&
+    !slave->orphanedOperations.contains(operation->uuid());
 
   if (frameworkWillAcknowledge) {
     // Forward the status update to the framework.
@@ -8754,6 +8758,30 @@ void Master::updateOperationStatus(UpdateOperationStatusMessage&& update)
       // an operation ID or the associated framework terminated, so
       // the master has to send an acknowledgement.
 
+      // If an orphan operation belongs to a framework that is not
+      // marked "completed", there is a chance the framework will
+      // reregister in future. The master will drop these status
+      // updates until the framework reregisters or a certain amount
+      // of time has passed since the associated agent has reregistered.
+      //
+      // This behavior prevents the master from acknowledging operations
+      // directly after master failover, while both agents and frameworks
+      // reregister. If an agent with pending operations reregisters first,
+      // the operations may be considered orphans until the framework
+      // reregisters.
+      //
+      // NOTE: Frameworks rotated out of the master's completed frameworks
+      // buffer may also be affected by this wait.
+      if (operation->info().has_id() &&
+          slave->orphanedOperations.contains(operation->uuid()) &&
+          !isCompletedFramework(frameworkId.get())) {
+        if (slave->reregisteredTime.isSome() &&
+            (Clock::now() - slave->reregisteredTime.get()) <
+              MIN_WAIT_BEFORE_ORPHAN_OPERATION_ADOPTION) {
+          return;
+        }
+      }
+
       Result<ResourceProviderID> resourceProviderId =
         getResourceProviderId(operation->info());
 

Reply via email to