This is an automated email from the ASF dual-hosted git repository.

holden pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 160b3be  [SPARK-34764][CORE][K8S][UI] Propagate reason for exec loss to Web UI
160b3be is described below

commit 160b3bee71ac5c4b5587fa16c8eef753e9a4ad91
Author: Holden Karau <hka...@apple.com>
AuthorDate: Thu May 13 16:02:31 2021 -0700

    [SPARK-34764][CORE][K8S][UI] Propagate reason for exec loss to Web UI
    
    ### What changes were proposed in this pull request?
    
    Adds the exec loss reason to the Spark web UI and, in doing so, also fixes the Kubernetes integration to pass the exec loss reason into core.
    
    UI change:
    
    
    ![image](https://user-images.githubusercontent.com/59893/117045762-b975ba80-acc4-11eb-9679-8edab3cfadc2.png)
    
    ### Why are the changes needed?
    
    Debugging Spark jobs is *hard*; making it clearer why executors have exited could help.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, a new column on the executors page.
    
    ### How was this patch tested?
    
    K8s unit tests updated to validate that exec loss reasons are passed through regardless of exec alive state; manual testing to validate the UI.
    
    Closes #32436 from holdenk/SPARK-34764-propegate-reason-for-exec-loss.
    
    Lead-authored-by: Holden Karau <hka...@apple.com>
    Co-authored-by: Holden Karau <hol...@pigscanfly.ca>
    Signed-off-by: Holden Karau <hka...@apple.com>
---
 .../spark/ui/static/executorspage-template.html    |  1 +
 .../org/apache/spark/ui/static/executorspage.js    | 15 +++++++++-
 .../cluster/k8s/ExecutorPodsLifecycleManager.scala | 34 ++++++++++++++++++++--
 .../k8s/KubernetesClusterSchedulerBackend.scala    |  4 +--
 .../k8s/ExecutorPodsLifecycleManagerSuite.scala    | 21 ++++++++-----
 .../KubernetesClusterSchedulerBackendSuite.scala   |  4 +--
 6 files changed, 63 insertions(+), 16 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
index be6d7bc..37d56a0 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
+++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
@@ -128,6 +128,7 @@ limitations under the License.
               Shuffle Write</span></th>
           <th>Logs</th>
           <th>Thread Dump</th>
+          <th>Exec Loss Reason</th>
         </tr>
         </thead>
         <tbody>
diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
index 2055c8f..5cc2868 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
@@ -31,6 +31,14 @@ function getThreadDumpEnabled() {
   return threadDumpEnabled;
 }
 
+function formatLossReason(removeReason, type, row) {
+    if (removeReason) {
+       return removeReason
+    } else {
+       return ""
+    }
+}
+
 function formatStatus(status, type, row) {
   if (row.isExcluded) {
     return "Excluded";
@@ -132,7 +140,7 @@ function totalDurationColor(totalGCTime, totalDuration) {
 }
 
 var sumOptionalColumns = [3, 4];
-var execOptionalColumns = [5, 6, 7, 8, 9, 10, 13, 14];
+var execOptionalColumns = [5, 6, 7, 8, 9, 10, 13, 14, 15];
 var execDataTable;
 var sumDataTable;
 
@@ -543,6 +551,10 @@ $(document).ready(function () {
               data: 'id', render: function (data, type) {
                return type === 'display' ? ("<a href='threadDump/?executorId=" + data + "'>Thread Dump</a>" ) : data;
               }
+            },
+            {
+                data: 'removeReason',
+                render: formatLossReason
             }
           ],
           "order": [[0, "asc"]],
@@ -709,6 +721,7 @@ $(document).ready(function () {
           "<div id='direct_mapped_pool_memory' 
class='direct_mapped_pool_memory-checkbox-div'><input type='checkbox' 
class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='10'> Peak Pool Memory 
Direct / Mapped</div>" +
           "<div id='extra_resources' class='resources-checkbox-div'><input 
type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='13'> 
Resources</div>" +
           "<div id='resource_prof_id' 
class='resource-prof-id-checkbox-div'><input type='checkbox' class='toggle-vis' 
data-sum-col-idx='' data-exec-col-idx='14'> Resource Profile Id</div>" +
+          "<div id='exec_loss_reason' 
class='exec-loss-reason-checkbox-div'><input type='checkbox' class='toggle-vis' 
data-sum-col-idx='' data-exec-col-idx='15'> Exec Loss Reason</div>" +
           "</div>");
 
         reselectCheckboxesBasedOnTaskTableState();
diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala
index 67e8f7e..e255de4 100644
--- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala
+++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala
@@ -221,10 +221,17 @@ private[spark] class ExecutorPodsLifecycleManager(
     val pod = podState.pod
     val reason = Option(pod.getStatus.getReason)
     val message = Option(pod.getStatus.getMessage)
+    val explained = describeExitCode(exitCode)
+    val exitMsg = s"The executor with id $execId exited with exit code $explained."
+    val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}")
+    val msgStr = message.map(m => s"The API gave the following message: ${m}")
+
+
     s"""
-       |The executor with id $execId exited with exit code $exitCode.
-       |The API gave the following brief reason: ${reason.getOrElse("N/A")}
-       |The API gave the following message: ${message.getOrElse("N/A")}
+       |${exitMsg}
+       |${reasonStr.getOrElse("")}
+       |${msgStr.getOrElse("")}
+       |
        |The API gave the following container statuses:
        |
        |${containersDescription(pod)}
@@ -246,4 +253,25 @@ private[spark] class ExecutorPodsLifecycleManager(
 
 private object ExecutorPodsLifecycleManager {
   val UNKNOWN_EXIT_CODE = -1
+
+  // A utility function to try and help people figure out whats gone wrong faster.
+  def describeExitCode(code: Int): String = {
+    val humanStr = code match {
+      case 0 => "(success)"
+      case 1 => "(generic, look at logs to clarify)"
+      case 42 => "(Douglas Adams fan)"
+      // Spark specific
+      case 10 | 50 => "(Uncaught exception)"
+      case 52 => "(JVM OOM)"
+      case 53 => "(DiskStore failed to create temp dir)"
+      // K8s & JVM specific exit codes
+      case 126 => "(not executable - possibly perm or arch)"
+      case 137 => "(SIGKILL, possible container OOM)"
+      case 139 => "(SIGSEGV: that's unexpected)"
+      case 255 => "(exit-1, your guess is as good as mine)"
+      case _ => "(unexpected)"
+    }
+    s"${code}${humanStr}"
+  }
+
 }
diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala
index d5a4856..5dad6a3 100644
--- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala
+++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala
@@ -66,9 +66,7 @@ private[spark] class KubernetesClusterSchedulerBackend(
 
   // Allow removeExecutor to be accessible by ExecutorPodsLifecycleEventHandler
  private[k8s] def doRemoveExecutor(executorId: String, reason: ExecutorLossReason): Unit = {
-    if (isExecutorActive(executorId)) {
-      removeExecutor(executorId, reason)
-    }
+    removeExecutor(executorId, reason)
   }
 
   private def setUpExecutorConfigMap(driverPod: Option[Pod]): Unit = {
diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
index d84d28f..e3ec53a 100644
--- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
+++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
@@ -69,7 +69,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
     val failedPod = failedExecutorWithoutDeletion(1)
     snapshotsStore.updatePod(failedPod)
     snapshotsStore.notifySubscribers()
-    val msg = exitReasonMessage(1, failedPod)
+    val msg = exitReasonMessage(1, failedPod, 1)
     val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
     verify(schedulerBackend).doRemoveExecutor("1", expectedLossReason)
     verify(namedExecutorPods(failedPod.getMetadata.getName)).delete()
@@ -81,7 +81,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
     snapshotsStore.notifySubscribers()
     snapshotsStore.updatePod(failedPod)
     snapshotsStore.notifySubscribers()
-    val msg = exitReasonMessage(1, failedPod)
+    val msg = exitReasonMessage(1, failedPod, 1)
     val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
    verify(schedulerBackend, times(1)).doRemoveExecutor("1", expectedLossReason)
     verify(namedExecutorPods(failedPod.getMetadata.getName), times(2)).delete()
@@ -114,7 +114,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
     eventHandlerUnderTest.conf.set(Config.KUBERNETES_DELETE_EXECUTORS, false)
     snapshotsStore.updatePod(failedPod)
     snapshotsStore.notifySubscribers()
-    val msg = exitReasonMessage(1, failedPod)
+    val msg = exitReasonMessage(1, failedPod, 1)
     val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
     verify(schedulerBackend).doRemoveExecutor("1", expectedLossReason)
     verify(namedExecutorPods(failedPod.getMetadata.getName), never()).delete()
@@ -126,13 +126,20 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
    assert(pod.getMetadata().getLabels().get(SPARK_EXECUTOR_INACTIVE_LABEL) === "true")
   }
 
-  private def exitReasonMessage(failedExecutorId: Int, failedPod: Pod): String = {
+  private def exitReasonMessage(execId: Int, failedPod: Pod, exitCode: Int): String = {
     val reason = Option(failedPod.getStatus.getReason)
     val message = Option(failedPod.getStatus.getMessage)
+    val explained = ExecutorPodsLifecycleManager.describeExitCode(exitCode)
+    val exitMsg = s"The executor with id $execId exited with exit code $explained."
+    val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}")
+    val msgStr = message.map(m => s"The API gave the following message: ${m}")
+
+
     s"""
-       |The executor with id $failedExecutorId exited with exit code 1.
-       |The API gave the following brief reason: ${reason.getOrElse("N/A")}
-       |The API gave the following message: ${message.getOrElse("N/A")}
+       |${exitMsg}
+       |${reasonStr.getOrElse("")}
+       |${msgStr.getOrElse("")}
+       |
        |The API gave the following container statuses:
        |
        |${containersDescription(failedPod)}
diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
index c4b878d..5dd84e8 100644
--- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
+++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
@@ -158,10 +158,10 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn
 
     backend.start()
     backend.doRemoveExecutor("1", ExecutorKilled)
-    verify(driverEndpointRef, never()).send(RemoveExecutor("1", ExecutorKilled))
+    verify(driverEndpointRef).send(RemoveExecutor("1", ExecutorKilled))
 
     backend.doRemoveExecutor("2", ExecutorKilled)
-    verify(driverEndpointRef, never()).send(RemoveExecutor("1", ExecutorKilled))
+    verify(driverEndpointRef).send(RemoveExecutor("2", ExecutorKilled))
   }
 
   test("Kill executors") {

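As illustration only, and not part of the patch above: a minimal, self-contained Scala sketch of the message shape produced by the new describeExitCode helper. The exit-code cases are a subset of those added in the ExecutorPodsLifecycleManager hunk; the ExitCodeSketch wrapper object, main method, and the hypothetical executor id 7 are invented here purely so the snippet runs on its own.

// Illustrative sketch only, not part of the commit. The exit-code cases below
// are copied from a subset of the mapping added in ExecutorPodsLifecycleManager.
object ExitCodeSketch {
  def describeExitCode(code: Int): String = {
    val humanStr = code match {
      case 0 => "(success)"
      case 52 => "(JVM OOM)"
      case 137 => "(SIGKILL, possible container OOM)"
      case _ => "(unexpected)"
    }
    // Same formatting as the patch: code immediately followed by the hint.
    s"${code}${humanStr}"
  }

  def main(args: Array[String]): Unit = {
    val execId = 7 // hypothetical executor id
    // Roughly the first line of the new loss-reason message shown in the UI:
    println(s"The executor with id $execId exited with exit code ${describeExitCode(137)}.")
    // Prints: The executor with id 7 exited with exit code 137(SIGKILL, possible container OOM).
  }
}

The helper in the actual patch also covers Spark-specific codes such as 10/50 (uncaught exception) and 53 (DiskStore failed to create temp dir), plus K8s/JVM codes 126, 139, and 255, as listed in the hunk above.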
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
