This is an automated email from the ASF dual-hosted git repository.

holden pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 451ac4c4d96  [SPARK-38969][K8S] Fix Decom reporting
451ac4c4d96 is described below

commit 451ac4c4d96f08df8176d9b7c872ab8968623b69
Author: Holden Karau <hol...@pigscanfly.ca>
AuthorDate: Fri Aug 12 16:29:47 2022 -0700

    [SPARK-38969][K8S] Fix Decom reporting

    ### What changes were proposed in this pull request?

    Change how we account for executor loss reasons.

    ### Why are the changes needed?

    Fixes a race condition in how we account for executors which decommission
    quickly: an executor can finish decommissioning and report a decommission
    loss reason before the monitor has marked it as decommissioning, so its
    exit was miscounted.

    ### Does this PR introduce _any_ user-facing change?

    No

    ### How was this patch tested?

    Existing core tests.

    Closes #36434 from holdenk/SPARK-38969-decom-reporting.

    Authored-by: Holden Karau <hol...@pigscanfly.ca>
    Signed-off-by: Holden Karau <hka...@netflix.com>
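The heart of the race: ExecutorMonitor learns about decommissioning
asynchronously, so a fast executor can be removed with a decommission loss
reason while its tracked `decommissioning` flag is still false. Checking the
loss reason first counts that case as graceful either way. A minimal,
self-contained Scala sketch of the reordered accounting; `TrackedExec`,
`LossEvent`, and the bare counters are illustrative stand-ins, not the real
ExecutorMonitor types:

```scala
// Illustrative stand-ins for the real ExecutorMonitor state (not Spark classes).
object DecomAccountingSketch extends App {
  case class TrackedExec(decommissioning: Boolean, pendingRemoval: Boolean)
  case class LossEvent(reason: String)

  // Stands in for ExecutorLossMessage.decommissionFinished /
  // ExecutorDecommission().message.
  val DecomFinished = "Finished decommissioning"

  var graceful, unfinished, driverKilled, unexpected = 0

  // After the fix: the loss reason is checked first, so an executor that
  // finished decommissioning before the monitor flagged it still counts
  // as graceful.
  def account(removed: TrackedExec, event: LossEvent): Unit =
    if (event.reason == DecomFinished) graceful += 1
    else if (removed.decommissioning) unfinished += 1
    else if (removed.pendingRemoval) driverKilled += 1
    else unexpected += 1

  // The racy case the old nesting miscounted: decommission finished before
  // the monitor ever saw decommissioning == true.
  account(TrackedExec(decommissioning = false, pendingRemoval = false),
    LossEvent(DecomFinished))
  assert(graceful == 1 && unexpected == 0) // old code fell through to the final else
}
```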
---
 .../apache/spark/scheduler/dynalloc/ExecutorMonitor.scala | 10 ++++------
 .../kubernetes/docker/src/main/dockerfiles/spark/decom.sh | 15 ++++++++++-----
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala
index defef5bfcf2..9132d0e46d3 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala
@@ -355,13 +355,11 @@ private[spark] class ExecutorMonitor(
     val removed = executors.remove(event.executorId)
     if (removed != null) {
       decrementExecResourceProfileCount(removed.resourceProfileId)
-      if (removed.decommissioning) {
-        if (event.reason == ExecutorLossMessage.decommissionFinished ||
-            event.reason == ExecutorDecommission().message) {
-          metrics.gracefullyDecommissioned.inc()
-        } else {
+      if (event.reason == ExecutorLossMessage.decommissionFinished ||
+          event.reason == ExecutorDecommission().message) {
+        metrics.gracefullyDecommissioned.inc()
+      } else if (removed.decommissioning) {
         metrics.decommissionUnfinished.inc()
-        }
       } else if (removed.pendingRemoval) {
         metrics.driverKilled.inc()
       } else {
diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh
index cd973df257f..4c25b42e964 100755
--- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh
+++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh
@@ -18,17 +18,22 @@
 #

-set -ex
+set +e
+set -x
 echo "Asked to decommission"
 # Find the pid to signal
 date | tee -a ${LOG}
-WORKER_PID=$(ps -o pid -C java | tail -n 1 | awk '{ sub(/^[ \t]+/, ""); print }')
+WORKER_PID=$(ps -o pid,cmd -C java | grep Executor \
+  | tail -n 1 | awk '{ sub(/^[ \t]+/, ""); print }' \
+  | cut -f 1 -d " ")
 echo "Using worker pid $WORKER_PID"
 kill -s SIGPWR ${WORKER_PID}
-# For now we expect this to timeout, since we don't start exiting the backend.
+# If the worker does exit, stop blocking K8s cleanup. Note this is a "soft"
+# block, since the pod itself has a maximum decommissioning time which will
+# override this.
 echo "Waiting for worker pid to exit"
-# If the worker does exit stop blocking the cleanup.
-timeout 60 tail --pid=${WORKER_PID} -f /dev/null
+tail --pid=${WORKER_PID} -f /dev/null
+sleep 1
 date
 echo "Done"
 date
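On the other side of `kill -s SIGPWR ${WORKER_PID}`, the executor JVM catches
the signal and begins graceful decommissioning rather than exiting
immediately; the script's `tail --pid` then keeps the pod alive until that
process is gone, with the pod's own grace period (e.g.
`terminationGracePeriodSeconds`) as the hard upper bound the comment refers
to. A generic, self-contained JVM sketch of catching SIGPWR, not Spark's
actual SignalUtils wiring:

```scala
import sun.misc.{Signal, SignalHandler}

// Generic SIGPWR handler sketch (Linux JVMs; sun.misc.Signal ships in the
// jdk.unsupported module). Spark's executor backend does something similar,
// starting decommission work instead of shutting down on the spot.
object SigPwrSketch {
  def main(args: Array[String]): Unit = {
    Signal.handle(new Signal("PWR"), new SignalHandler {
      override def handle(sig: Signal): Unit = {
        println(s"Caught SIG${sig.getName()}, decommissioning...")
        // Drain tasks / migrate data here, then exit; decom.sh's
        // `tail --pid` unblocks only once this process is gone.
        System.exit(0)
      }
    })
    println(s"pid=${ProcessHandle.current().pid()}; send SIGPWR to trigger")
    Thread.sleep(600000) // stay alive long enough to receive the signal
  }
}
```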