This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 57d27e9 [SPARK-31125][K8S] Terminating pods have a deletion timestamp but they are not yet dead 57d27e9 is described below commit 57d27e900f79e6c5699b9a23db236aae98e761ad Author: Holden Karau <hka...@apple.com> AuthorDate: Tue Mar 17 12:04:06 2020 -0700 [SPARK-31125][K8S] Terminating pods have a deletion timestamp but they are not yet dead ### What changes were proposed in this pull request? Change what we consider a deleted pod to not include "Terminating". ### Why are the changes needed? If we get a new snapshot while a pod is in the process of being cleaned up, we shouldn't delete the executor until it is fully terminated. ### Does this PR introduce any user-facing change? No ### How was this patch tested? This should be covered by the decommissioning tests, which are currently flaky because we sometimes delete the executor instead of allowing it to decommission all the way. I also ran this in a loop locally ~80 times, with the only failures being the PV suite because of unrelated minikube mount issues. Closes #27905 from holdenk/SPARK-31125-Processing-state-snapshots-incorrect. 
Authored-by: Holden Karau <hka...@apple.com> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- .../spark/scheduler/cluster/k8s/ExecutorPodStates.scala | 2 ++ .../spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala index 83daddf..34fca29 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala @@ -34,4 +34,6 @@ case class PodFailed(pod: Pod) extends FinalPodState case class PodDeleted(pod: Pod) extends FinalPodState +case class PodTerminating(pod: Pod) extends FinalPodState + case class PodUnknown(pod: Pod) extends ExecutorPodState diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala index 435a5f1..30030ab 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala @@ -64,6 +64,8 @@ object ExecutorPodsSnapshot extends Logging { PodFailed(pod) case "succeeded" => PodSucceeded(pod) + case "terminating" => + PodTerminating(pod) case _ => logWarning(s"Received unknown phase $phase for executor pod with name" + s" ${pod.getMetadata.getName} in namespace ${pod.getMetadata.getNamespace}") @@ -72,5 +74,12 @@ object ExecutorPodsSnapshot extends Logging { } } - private def isDeleted(pod: Pod): Boolean = 
pod.getMetadata.getDeletionTimestamp != null + private def isDeleted(pod: Pod): Boolean = { + (pod.getMetadata.getDeletionTimestamp != null && + ( + pod.getStatus == null || + pod.getStatus.getPhase == null || + pod.getStatus.getPhase.toLowerCase(Locale.ROOT) != "terminating" + )) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org