Repository: mesos
Updated Branches:
  refs/heads/master 8ab7a6f4d -> 8330e99f6


Added new TaskState values and PARTITION_AWARE capability.

This commit adds five new TaskState values: TASK_DROPPED,
TASK_UNREACHABLE, TASK_GONE, TASK_GONE_BY_OPERATOR, and TASK_UNKNOWN.
These values are intended to replace the existing TASK_LOST state by
offering more fine-grained information on the current state of a task.
These states will only be sent to frameworks that opt into this new
behavior via the PARTITION_AWARE capability.

Note that this commit doesn't add a master metric for the TASK_UNKNOWN
status, because this is a "default" status reported when the master has
no knowledge of a particular task/agent ID. Hence the number of
"unknown" tasks at any given time is not a well-defined metric.

Review: https://reviews.apache.org/r/50699/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c3268cad
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c3268cad
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c3268cad

Branch: refs/heads/master
Commit: c3268cad3621a6373ff331d882393b2ada064f4b
Parents: 8ab7a6f
Author: Neil Conway <neil.con...@gmail.com>
Authored: Fri Aug 26 14:47:53 2016 -0700
Committer: Vinod Kone <vinodk...@gmail.com>
Committed: Fri Aug 26 14:47:53 2016 -0700

----------------------------------------------------------------------
 include/mesos/mesos.proto            | 59 ++++++++++++++++++++++++++++++-
 include/mesos/v1/mesos.proto         | 59 ++++++++++++++++++++++++++++++-
 src/common/protobuf_utils.cpp        |  8 ++++-
 src/examples/disk_full_framework.cpp |  5 +++
 src/master/http.cpp                  | 19 +++++++++-
 src/master/master.cpp                | 42 ++++++++++++++++++----
 src/master/metrics.cpp               | 16 +++++++++
 src/master/metrics.hpp               |  4 +++
 src/tests/master_tests.cpp           |  4 ++-
 9 files changed, 205 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/include/mesos/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto
index a93db55..7fbcdf0 100644
--- a/include/mesos/mesos.proto
+++ b/include/mesos/mesos.proto
@@ -297,6 +297,24 @@ message FrameworkInfo {
       // Receive offers with resources that are shared.
       // TODO(anindya_sinha): This is currently a no-op.
       SHARED_RESOURCES = 4;
+
+      // Indicates that the framework is prepared to handle the
+      // following TaskStates: TASK_UNREACHABLE, TASK_DROPPED,
+      // TASK_GONE, TASK_GONE_BY_OPERATOR, and TASK_UNKNOWN.
+      //
+      // With this capability, frameworks can define how they would
+      // like to handle partitioned tasks. Frameworks will receive
+      // TASK_UNREACHABLE for tasks on partitioned agents; if/when the
+      // partitioned agent reregisters, the task will not be killed.
+      // Frameworks that enable this capability will never receive
+      // TASK_LOST; they will receive one of the more specific task
+      // statuses listed above instead.
+      //
+      // Without this capability, frameworks will receive TASK_LOST
+      // for tasks on partitioned agents; such tasks will be killed by
+      // Mesos when the agent reregisters (unless the master has
+      // failed over).
+      PARTITION_AWARE = 5;
     }
 
     // Enum fields should be optional, see: MESOS-4997.
@@ -1489,8 +1507,47 @@ enum TaskState {
   TASK_FINISHED = 2; // TERMINAL: The task finished successfully.
   TASK_FAILED = 3;   // TERMINAL: The task failed to finish successfully.
   TASK_KILLED = 4;   // TERMINAL: The task was killed by the executor.
-  TASK_LOST = 5;     // TERMINAL: The task failed but can be rescheduled.
   TASK_ERROR = 7;    // TERMINAL: The task description contains an error.
+
+  // This is only sent when the framework does NOT opt-in to the
+  // PARTITION_AWARE capability.
+  TASK_LOST = 5;     // TERMINAL: The task failed but can be rescheduled.
+
+  // The following task statuses are only sent when the framework
+  // opts-in to the PARTITION_AWARE capability.
+
+  // The task failed to launch because of a transient error. The
+  // task's executor never started running. Unlike TASK_ERROR, the
+  // task description is valid -- attempting to launch the task again
+  // may be successful. This is a terminal state.
+  TASK_DROPPED = 9;
+
+  // The task was running on an agent that has lost contact with the
+  // master, typically due to a network failure or partition. The task
+  // may or may not still be running.
+  TASK_UNREACHABLE = 10;
+
+  // The task was running on an agent that has been shutdown (e.g.,
+  // the agent became partitioned, rebooted, and then reconnected to
+  // the master; any tasks running before the reboot will transition
+  // from UNREACHABLE to GONE). The task is no longer running. This is
+  // a terminal state.
+  TASK_GONE = 11;
+
+  // The task was running on an agent that the master cannot contact;
+  // the operator has asserted that the agent has been shutdown, but
+  // this has not been directly confirmed by the master. If the
+  // operator is correct, the task is not running and this is a
+  // terminal state; if the operator is mistaken, the task might still
+  // be running, and might return to the RUNNING state in the future.
+  TASK_GONE_BY_OPERATOR = 12;
+
+  // The master has no knowledge of the task. This is typically
+  // because either (a) the master never had knowledge of the task, or
+  // (b) the master forgot about the task because it garbage
+  // collected its metadata about the task. The task may or may not
+  // still be running.
+  TASK_UNKNOWN = 13;
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/include/mesos/v1/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto
index 4a7e998..60ec0cc 100644
--- a/include/mesos/v1/mesos.proto
+++ b/include/mesos/v1/mesos.proto
@@ -297,6 +297,24 @@ message FrameworkInfo {
       // Receive offers with resources that are shared.
       // TODO(anindya_sinha): This is currently a no-op.
       SHARED_RESOURCES = 4;
+
+      // Indicates that the framework is prepared to handle the
+      // following TaskStates: TASK_UNREACHABLE, TASK_DROPPED,
+      // TASK_GONE, TASK_GONE_BY_OPERATOR, and TASK_UNKNOWN.
+      //
+      // With this capability, frameworks can define how they would
+      // like to handle partitioned tasks. Frameworks will receive
+      // TASK_UNREACHABLE for tasks on partitioned agents; if/when the
+      // partitioned agent reregisters, the task will not be killed.
+      // Frameworks that enable this capability will never receive
+      // TASK_LOST; they will receive one of the more specific task
+      // statuses listed above instead.
+      //
+      // Without this capability, frameworks will receive TASK_LOST
+      // for tasks on partitioned agents; such tasks will be killed by
+      // Mesos when the agent reregisters (unless the master has
+      // failed over).
+      PARTITION_AWARE = 5;
     }
 
     // Enum fields should be optional, see: MESOS-4997.
@@ -1488,8 +1506,47 @@ enum TaskState {
   TASK_FINISHED = 2; // TERMINAL: The task finished successfully.
   TASK_FAILED = 3;   // TERMINAL: The task failed to finish successfully.
   TASK_KILLED = 4;   // TERMINAL: The task was killed by the executor.
-  TASK_LOST = 5;     // TERMINAL: The task failed but can be rescheduled.
   TASK_ERROR = 7;    // TERMINAL: The task description contains an error.
+
+  // This is only sent when the framework does NOT opt-in to the
+  // PARTITION_AWARE capability.
+  TASK_LOST = 5;     // TERMINAL: The task failed but can be rescheduled.
+
+  // The following task statuses are only sent when the framework
+  // opts-in to the PARTITION_AWARE capability.
+
+  // The task failed to launch because of a transient error. The
+  // task's executor never started running. Unlike TASK_ERROR, the
+  // task description is valid -- attempting to launch the task again
+  // may be successful. This is a terminal state.
+  TASK_DROPPED = 9;
+
+  // The task was running on an agent that has lost contact with the
+  // master, typically due to a network failure or partition. The task
+  // may or may not still be running.
+  TASK_UNREACHABLE = 10;
+
+  // The task was running on an agent that has been shutdown (e.g.,
+  // the agent became partitioned, rebooted, and then reconnected to
+  // the master; any tasks running before the reboot will transition
+  // from UNREACHABLE to GONE). The task is no longer running. This is
+  // a terminal state.
+  TASK_GONE = 11;
+
+  // The task was running on an agent that the master cannot contact;
+  // the operator has asserted that the agent has been shutdown, but
+  // this has not been directly confirmed by the master. If the
+  // operator is correct, the task is not running and this is a
+  // terminal state; if the operator is mistaken, the task might still
+  // be running, and might return to the RUNNING state in the future.
+  TASK_GONE_BY_OPERATOR = 12;
+
+  // The master has no knowledge of the task. This is typically
+  // because either (a) the master never had knowledge of the task, or
+  // (b) the master forgot about the task because it garbage
+  // collected its metadata about the task. The task may or may not
+  // still be running.
+  TASK_UNKNOWN = 13;
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/common/protobuf_utils.cpp
----------------------------------------------------------------------
diff --git a/src/common/protobuf_utils.cpp b/src/common/protobuf_utils.cpp
index 8c4a726..ed3ac7f 100644
--- a/src/common/protobuf_utils.cpp
+++ b/src/common/protobuf_utils.cpp
@@ -76,11 +76,17 @@ bool frameworkHasCapability(
 
 bool isTerminalState(const TaskState& state)
 {
+  // TODO(neilc): Revise/rename this function. LOST, UNREACHABLE, and
+  // GONE_BY_OPERATOR are not truly "terminal".
   return (state == TASK_FINISHED ||
           state == TASK_FAILED ||
           state == TASK_KILLED ||
           state == TASK_LOST ||
-          state == TASK_ERROR);
+          state == TASK_ERROR ||
+          state == TASK_UNREACHABLE ||
+          state == TASK_DROPPED ||
+          state == TASK_GONE ||
+          state == TASK_GONE_BY_OPERATOR);
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/examples/disk_full_framework.cpp
----------------------------------------------------------------------
diff --git a/src/examples/disk_full_framework.cpp b/src/examples/disk_full_framework.cpp
index ad304fe..1221f83 100644
--- a/src/examples/disk_full_framework.cpp
+++ b/src/examples/disk_full_framework.cpp
@@ -230,6 +230,10 @@ public:
     case TASK_KILLED:
     case TASK_LOST:
     case TASK_ERROR:
+    case TASK_DROPPED:
+    case TASK_UNREACHABLE:
+    case TASK_GONE:
+    case TASK_GONE_BY_OPERATOR:
       if (flags.run_once) {
         driver->abort();
       }
@@ -241,6 +245,7 @@ public:
     case TASK_RUNNING:
     case TASK_STAGING:
     case TASK_KILLING:
+    case TASK_UNKNOWN:
       break;
     }
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/master/http.cpp
----------------------------------------------------------------------
diff --git a/src/master/http.cpp b/src/master/http.cpp
index c6bdad6..525ef6c 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -2872,7 +2872,12 @@ struct TaskStateSummary
       killed(0),
       failed(0),
       lost(0),
-      error(0) {}
+      error(0),
+      dropped(0),
+      unreachable(0),
+      gone(0),
+      gone_by_operator(0),
+      unknown(0) {}
 
   // Account for the state of the given task.
   void count(const Task& task)
@@ -2887,6 +2892,11 @@ struct TaskStateSummary
       case TASK_FAILED: { ++failed; break; }
       case TASK_LOST: { ++lost; break; }
       case TASK_ERROR: { ++error; break; }
+      case TASK_DROPPED: { ++dropped; break; }
+      case TASK_UNREACHABLE: { ++unreachable; break; }
+      case TASK_GONE: { ++gone; break; }
+      case TASK_GONE_BY_OPERATOR: { ++gone_by_operator; break; }
+      case TASK_UNKNOWN: { ++unknown; break; }
       // No default case allows for a helpful compiler error if we
       // introduce a new state.
     }
@@ -2901,6 +2911,11 @@ struct TaskStateSummary
   size_t failed;
   size_t lost;
   size_t error;
+  size_t dropped;
+  size_t unreachable;
+  size_t gone;
+  size_t gone_by_operator;
+  size_t unknown;
 };
 
 
@@ -3045,6 +3060,7 @@ Future<Response> Master::Http::stateSummary(
               const TaskStateSummary& summary =
                 taskStateSummaries.slave(slave->id);
 
+              // TODO(neilc): Update for new PARTITION_AWARE task statuses.
               writer->field("TASK_STAGING", summary.staging);
               writer->field("TASK_STARTING", summary.starting);
               writer->field("TASK_RUNNING", summary.running);
@@ -3095,6 +3111,7 @@ Future<Response> Master::Http::stateSummary(
               const TaskStateSummary& summary =
                 taskStateSummaries.framework(frameworkId);
 
+              // TODO(neilc): Update for new PARTITION_AWARE task statuses.
               writer->field("TASK_STAGING", summary.staging);
               writer->field("TASK_STARTING", summary.starting);
               writer->field("TASK_RUNNING", summary.running);

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 5c00f33..ae38c1a 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -7094,12 +7094,42 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
     }
 
     switch (status.state()) {
-      case TASK_FINISHED: ++metrics->tasks_finished; break;
-      case TASK_FAILED:   ++metrics->tasks_failed;   break;
-      case TASK_KILLED:   ++metrics->tasks_killed;   break;
-      case TASK_LOST:     ++metrics->tasks_lost;     break;
-      case TASK_ERROR:    ++metrics->tasks_error;    break;
-      default:                                       break;
+      case TASK_FINISHED:
+        ++metrics->tasks_finished;
+        break;
+      case TASK_FAILED:
+        ++metrics->tasks_failed;
+        break;
+      case TASK_KILLED:
+        ++metrics->tasks_killed;
+        break;
+      case TASK_LOST:
+        ++metrics->tasks_lost;
+        break;
+      case TASK_ERROR:
+        ++metrics->tasks_error;
+        break;
+      case TASK_UNREACHABLE:
+        ++metrics->tasks_unreachable;
+        break;
+      case TASK_DROPPED:
+        ++metrics->tasks_dropped;
+        break;
+      case TASK_GONE:
+        ++metrics->tasks_gone;
+        break;
+      case TASK_GONE_BY_OPERATOR:
+        ++metrics->tasks_gone_by_operator;
+        break;
+      case TASK_STARTING:
+      case TASK_STAGING:
+      case TASK_RUNNING:
+      case TASK_KILLING:
+        break;
+      case TASK_UNKNOWN:
+        // Should not happen.
+        LOG(FATAL) << "Unexpected TASK_UNKNOWN for in-memory task";
+        break;
     }
 
     if (status.has_reason()) {

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/master/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.cpp b/src/master/metrics.cpp
index 88a752d..3d3338e 100644
--- a/src/master/metrics.cpp
+++ b/src/master/metrics.cpp
@@ -93,6 +93,14 @@ Metrics::Metrics(const Master& master)
         "master/tasks_lost"),
     tasks_error(
         "master/tasks_error"),
+    tasks_dropped(
+        "master/tasks_dropped"),
+    tasks_unreachable(
+        "master/tasks_unreachable"),
+    tasks_gone(
+        "master/tasks_gone"),
+    tasks_gone_by_operator(
+        "master/tasks_gone_by_operator"),
     dropped_messages(
         "master/dropped_messages"),
     messages_register_framework(
@@ -208,6 +216,10 @@ Metrics::Metrics(const Master& master)
   process::metrics::add(tasks_killed);
   process::metrics::add(tasks_lost);
   process::metrics::add(tasks_error);
+  process::metrics::add(tasks_dropped);
+  process::metrics::add(tasks_unreachable);
+  process::metrics::add(tasks_gone);
+  process::metrics::add(tasks_gone_by_operator);
 
   process::metrics::add(dropped_messages);
 
@@ -345,6 +357,10 @@ Metrics::~Metrics()
   process::metrics::remove(tasks_killed);
   process::metrics::remove(tasks_lost);
   process::metrics::remove(tasks_error);
+  process::metrics::remove(tasks_dropped);
+  process::metrics::remove(tasks_unreachable);
+  process::metrics::remove(tasks_gone);
+  process::metrics::remove(tasks_gone_by_operator);
 
   process::metrics::remove(dropped_messages);
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/master/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.hpp b/src/master/metrics.hpp
index 9d201fc..cfddb4b 100644
--- a/src/master/metrics.hpp
+++ b/src/master/metrics.hpp
@@ -66,6 +66,10 @@ struct Metrics
   process::metrics::Counter tasks_killed;
   process::metrics::Counter tasks_lost;
   process::metrics::Counter tasks_error;
+  process::metrics::Counter tasks_dropped;
+  process::metrics::Counter tasks_unreachable;
+  process::metrics::Counter tasks_gone;
+  process::metrics::Counter tasks_gone_by_operator;
 
   typedef hashmap<TaskStatus::Reason, process::metrics::Counter> Reasons;
   typedef hashmap<TaskStatus::Source, Reasons> SourcesReasons;

http://git-wip-us.apache.org/repos/asf/mesos/blob/c3268cad/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 398164d..4c12615 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -3093,7 +3093,9 @@ TEST_F(MasterTest, StateEndpointFrameworkInfo)
 
   vector<FrameworkInfo::Capability::Type> capabilities = {
     FrameworkInfo::Capability::REVOCABLE_RESOURCES,
-    FrameworkInfo::Capability::TASK_KILLING_STATE
+    FrameworkInfo::Capability::TASK_KILLING_STATE,
+    FrameworkInfo::Capability::GPU_RESOURCES,
+    FrameworkInfo::Capability::PARTITION_AWARE
   };
 
   foreach (FrameworkInfo::Capability::Type capability, capabilities) {

Reply via email to