This is an automated email from the ASF dual-hosted git repository.

holden pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 160b3be  [SPARK-34764][CORE][K8S][UI] Propagate reason for exec loss to Web UI
160b3be is described below

commit 160b3bee71ac5c4b5587fa16c8eef753e9a4ad91
Author:     Holden Karau <hka...@apple.com>
AuthorDate: Thu May 13 16:02:31 2021 -0700

    [SPARK-34764][CORE][K8S][UI] Propagate reason for exec loss to Web UI

    ### What changes were proposed in this pull request?

    Adds the exec loss reason to the Spark web UI and, in doing so, also fixes the Kube integration to pass the exec loss reason into core.

    UI change:

    ![image](https://user-images.githubusercontent.com/59893/117045762-b975ba80-acc4-11eb-9679-8edab3cfadc2.png)

    ### Why are the changes needed?

    Debugging Spark jobs is *hard*; making it clearer why executors have exited could help.

    ### Does this PR introduce _any_ user-facing change?

    Yes, a new column on the executor page.

    ### How was this patch tested?

    K8s unit tests updated to validate that exec loss reasons are passed through regardless of exec alive state, plus manual testing to validate the UI.

    Closes #32436 from holdenk/SPARK-34764-propegate-reason-for-exec-loss.

    Lead-authored-by: Holden Karau <hka...@apple.com>
    Co-authored-by: Holden Karau <hol...@pigscanfly.ca>
    Signed-off-by: Holden Karau <hka...@apple.com>
---
 .../spark/ui/static/executorspage-template.html    |  1 +
 .../org/apache/spark/ui/static/executorspage.js    | 15 +++++++++-
 .../cluster/k8s/ExecutorPodsLifecycleManager.scala | 34 ++++++++++++++++++++--
 .../k8s/KubernetesClusterSchedulerBackend.scala    |  4 +--
 .../k8s/ExecutorPodsLifecycleManagerSuite.scala    | 21 ++++++++-----
 .../KubernetesClusterSchedulerBackendSuite.scala   |  4 +--
 6 files changed, 63 insertions(+), 16 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
index be6d7bc..37d56a0 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
+++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
@@ -128,6 +128,7 @@ limitations under the License.
             Shuffle Write</span></th>
           <th>Logs</th>
           <th>Thread Dump</th>
+          <th>Exec Loss Reason</th>
         </tr>
         </thead>
         <tbody>
diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
index 2055c8f..5cc2868 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
@@ -31,6 +31,14 @@ function getThreadDumpEnabled() {
   return threadDumpEnabled;
 }
 
+function formatLossReason(removeReason, type, row) {
+  if (removeReason) {
+    return removeReason
+  } else {
+    return ""
+  }
+}
+
 function formatStatus(status, type, row) {
   if (row.isExcluded) {
     return "Excluded";
@@ -132,7 +140,7 @@ function totalDurationColor(totalGCTime, totalDuration) {
 }
 
 var sumOptionalColumns = [3, 4];
-var execOptionalColumns = [5, 6, 7, 8, 9, 10, 13, 14];
+var execOptionalColumns = [5, 6, 7, 8, 9, 10, 13, 14, 15];
 
 var execDataTable;
 var sumDataTable;
@@ -543,6 +551,10 @@ $(document).ready(function () {
         data: 'id', render: function (data, type) {
           return type === 'display' ? ("<a href='threadDump/?executorId=" + data + "'>Thread Dump</a>" ) : data;
         }
+      },
+      {
+        data: 'removeReason',
+        render: formatLossReason
       }
     ],
     "order": [[0, "asc"]],
@@ -709,6 +721,7 @@ $(document).ready(function () {
       "<div id='direct_mapped_pool_memory' class='direct_mapped_pool_memory-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='10'> Peak Pool Memory Direct / Mapped</div>" +
       "<div id='extra_resources' class='resources-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='13'> Resources</div>" +
       "<div id='resource_prof_id' class='resource-prof-id-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='14'> Resource Profile Id</div>" +
+      "<div id='exec_loss_reason' class='exec-loss-reason-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='15'> Exec Loss Reason</div>" +
       "</div>");
 
   reselectCheckboxesBasedOnTaskTableState();
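The `removeReason` field consumed above comes from the driver's executor REST payload. As a point of reference, a minimal sketch of reading it on the Scala side, assuming the public `v1.ExecutorSummary` model (which carries an optional `removeReason` that stays empty until the executor is lost, hence the empty-string fallback in `formatLossReason`):

```scala
import org.apache.spark.status.api.v1.ExecutorSummary

// Sketch only: the executors page is populated from the REST API
// (the allexecutors endpoint); a live executor has no removeReason yet,
// so the cell renders as "".
def lossReasonCell(summary: ExecutorSummary): String =
  summary.removeReason.getOrElse("")
```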
("<a href='threadDump/?executorId=" + data + "'>Thread Dump</a>" ) : data; } + }, + { + data: 'removeReason', + render: formatLossReason } ], "order": [[0, "asc"]], @@ -709,6 +721,7 @@ $(document).ready(function () { "<div id='direct_mapped_pool_memory' class='direct_mapped_pool_memory-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='10'> Peak Pool Memory Direct / Mapped</div>" + "<div id='extra_resources' class='resources-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='13'> Resources</div>" + "<div id='resource_prof_id' class='resource-prof-id-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='14'> Resource Profile Id</div>" + + "<div id='exec_loss_reason' class='exec-loss-reason-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='15'> Exec Loss Reason</div>" + "</div>"); reselectCheckboxesBasedOnTaskTableState(); diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala index 67e8f7e..e255de4 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala @@ -221,10 +221,17 @@ private[spark] class ExecutorPodsLifecycleManager( val pod = podState.pod val reason = Option(pod.getStatus.getReason) val message = Option(pod.getStatus.getMessage) + val explained = describeExitCode(exitCode) + val exitMsg = s"The executor with id $execId exited with exit code $explained." + val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}") + val msgStr = message.map(m => s"The API gave the following message: ${m}") + + s""" - |The executor with id $execId exited with exit code $exitCode. - |The API gave the following brief reason: ${reason.getOrElse("N/A")} - |The API gave the following message: ${message.getOrElse("N/A")} + |${exitMsg} + |${reasonStr.getOrElse("")} + |${msgStr.getOrElse("")} + | |The API gave the following container statuses: | |${containersDescription(pod)} @@ -246,4 +253,25 @@ private[spark] class ExecutorPodsLifecycleManager( private object ExecutorPodsLifecycleManager { val UNKNOWN_EXIT_CODE = -1 + + // A utility function to try and help people figure out whats gone wrong faster. 
diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala
index d5a4856..5dad6a3 100644
--- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala
+++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala
@@ -66,9 +66,7 @@ private[spark] class KubernetesClusterSchedulerBackend(
 
   // Allow removeExecutor to be accessible by ExecutorPodsLifecycleEventHandler
   private[k8s] def doRemoveExecutor(executorId: String, reason: ExecutorLossReason): Unit = {
-    if (isExecutorActive(executorId)) {
-      removeExecutor(executorId, reason)
-    }
+    removeExecutor(executorId, reason)
   }
 
   private def setUpExecutorConfigMap(driverPod: Option[Pod]): Unit = {
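The guard removal above is the heart of the fix: previously, a loss reason that arrived after an executor had already been marked inactive was silently dropped, so the UI never learned why the executor died. A self-contained sketch of the new hand-off, using stand-in types (the real `ExecutorExited` / `ExecutorLossReason` are Spark-internal):

```scala
// Stand-in for org.apache.spark.scheduler.ExecutorExited.
case class ExecutorExited(exitCode: Int, exitCausedByApp: Boolean, message: String)

class BackendStub {
  // Mirrors the patched doRemoveExecutor: no isExecutorActive guard, so the
  // loss reason is always forwarded to removeExecutor and ends up in the
  // UI's removeReason field.
  def doRemoveExecutor(executorId: String, reason: ExecutorExited): Unit =
    println(s"removeExecutor($executorId): ${reason.message}")
}

val reason = ExecutorExited(137, exitCausedByApp = true,
  "The executor with id 1 exited with exit code 137(SIGKILL, possible container OOM).")
new BackendStub().doRemoveExecutor("1", reason)
```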
diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
index d84d28f..e3ec53a 100644
--- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
+++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala
@@ -69,7 +69,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
     val failedPod = failedExecutorWithoutDeletion(1)
     snapshotsStore.updatePod(failedPod)
     snapshotsStore.notifySubscribers()
-    val msg = exitReasonMessage(1, failedPod)
+    val msg = exitReasonMessage(1, failedPod, 1)
     val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
     verify(schedulerBackend).doRemoveExecutor("1", expectedLossReason)
     verify(namedExecutorPods(failedPod.getMetadata.getName)).delete()
@@ -81,7 +81,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
     snapshotsStore.notifySubscribers()
     snapshotsStore.updatePod(failedPod)
     snapshotsStore.notifySubscribers()
-    val msg = exitReasonMessage(1, failedPod)
+    val msg = exitReasonMessage(1, failedPod, 1)
     val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
     verify(schedulerBackend, times(1)).doRemoveExecutor("1", expectedLossReason)
     verify(namedExecutorPods(failedPod.getMetadata.getName), times(2)).delete()
@@ -114,7 +114,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
     eventHandlerUnderTest.conf.set(Config.KUBERNETES_DELETE_EXECUTORS, false)
     snapshotsStore.updatePod(failedPod)
     snapshotsStore.notifySubscribers()
-    val msg = exitReasonMessage(1, failedPod)
+    val msg = exitReasonMessage(1, failedPod, 1)
     val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
     verify(schedulerBackend).doRemoveExecutor("1", expectedLossReason)
     verify(namedExecutorPods(failedPod.getMetadata.getName), never()).delete()
@@ -126,13 +126,20 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
     assert(pod.getMetadata().getLabels().get(SPARK_EXECUTOR_INACTIVE_LABEL) === "true")
   }
 
-  private def exitReasonMessage(failedExecutorId: Int, failedPod: Pod): String = {
+  private def exitReasonMessage(execId: Int, failedPod: Pod, exitCode: Int): String = {
     val reason = Option(failedPod.getStatus.getReason)
     val message = Option(failedPod.getStatus.getMessage)
+    val explained = ExecutorPodsLifecycleManager.describeExitCode(exitCode)
+    val exitMsg = s"The executor with id $execId exited with exit code $explained."
+    val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}")
+    val msgStr = message.map(m => s"The API gave the following message: ${m}")
+
+
     s"""
-       |The executor with id $failedExecutorId exited with exit code 1.
-       |The API gave the following brief reason: ${reason.getOrElse("N/A")}
-       |The API gave the following message: ${message.getOrElse("N/A")}
+       |${exitMsg}
+       |${reasonStr.getOrElse("")}
+       |${msgStr.getOrElse("")}
+       |
        |The API gave the following container statuses:
        |
        |${containersDescription(failedPod)}
diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
index c4b878d..5dd84e8 100644
--- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
+++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala
@@ -158,10 +158,10 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn
     backend.start()
 
     backend.doRemoveExecutor("1", ExecutorKilled)
-    verify(driverEndpointRef, never()).send(RemoveExecutor("1", ExecutorKilled))
+    verify(driverEndpointRef).send(RemoveExecutor("1", ExecutorKilled))
 
     backend.doRemoveExecutor("2", ExecutorKilled)
-    verify(driverEndpointRef, never()).send(RemoveExecutor("1", ExecutorKilled))
+    verify(driverEndpointRef).send(RemoveExecutor("2", ExecutorKilled))
   }
 
   test("Kill executors") {

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org