This is an automated email from the ASF dual-hosted git repository.

holden pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 451ac4c4d96 [SPARK-38969][K8S] Fix Decom reporting
451ac4c4d96 is described below

commit 451ac4c4d96f08df8176d9b7c872ab8968623b69
Author: Holden Karau <hol...@pigscanfly.ca>
AuthorDate: Fri Aug 12 16:29:47 2022 -0700

    [SPARK-38969][K8S] Fix Decom reporting
    
    ### What changes were proposed in this pull request?
    
    Change how we account for executor loss reasons.
    
    ### Why are the changes needed?
    
    There is a race condition when reporting executors which decommission
    quickly: the removal event can arrive before the monitor has marked the
    executor as decommissioning, so a graceful decommission may not be counted
    as such (a small sketch of the race follows the diffstat below).
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Existing core tests.
    
    Closes #36434 from holdenk/SPARK-38969-decom-reporting.
    
    Authored-by: Holden Karau <hol...@pigscanfly.ca>
    Signed-off-by: Holden Karau <hka...@netflix.com>
---
 .../apache/spark/scheduler/dynalloc/ExecutorMonitor.scala | 10 ++++------
 .../kubernetes/docker/src/main/dockerfiles/spark/decom.sh | 15 ++++++++++-----
 2 files changed, 14 insertions(+), 11 deletions(-)
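
The race described in the commit message is easiest to see as a before/after of the classification order. Below is a minimal, self-contained Scala sketch, not Spark code: LossReason, ExecState, and the label strings are illustrative stand-ins for ExecutorMonitor's tracking state and loss-reason messages. It shows how checking the decommissioning flag before the loss reason misclassifies an executor that exits before the monitor observes the decommissioning event.

// Minimal sketch of the accounting race; names here are illustrative only.
object DecomRaceSketch {
  sealed trait LossReason
  case object DecommissionFinished extends LossReason
  case object Unknown extends LossReason

  final case class ExecState(decommissioning: Boolean)

  // Old order: the loss reason is only consulted once the monitor has already
  // flagged the executor as decommissioning.
  def classifyOld(state: ExecState, reason: LossReason): String =
    if (state.decommissioning) {
      if (reason == DecommissionFinished) "graceful" else "unfinished"
    } else {
      "unexpected"
    }

  // New order: a decommission loss reason always counts as graceful, even if
  // the monitor never saw the executor enter the decommissioning state.
  def classifyNew(state: ExecState, reason: LossReason): String =
    if (reason == DecommissionFinished) "graceful"
    else if (state.decommissioning) "unfinished"
    else "unexpected"

  def main(args: Array[String]): Unit = {
    // Fast decommission: the executor exits before the monitor records the
    // decommissioning flag, but the removal event carries a decommission reason.
    val fast = ExecState(decommissioning = false)
    println(s"old: ${classifyOld(fast, DecommissionFinished)}") // old: unexpected
    println(s"new: ${classifyNew(fast, DecommissionFinished)}") // new: graceful
    // A decommission that started but ended for another reason is still
    // counted as unfinished under the new order.
    println(classifyNew(ExecState(decommissioning = true), Unknown)) // unfinished
  }
}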

diff --git a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala
index defef5bfcf2..9132d0e46d3 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala
@@ -355,13 +355,11 @@ private[spark] class ExecutorMonitor(
     val removed = executors.remove(event.executorId)
     if (removed != null) {
       decrementExecResourceProfileCount(removed.resourceProfileId)
-      if (removed.decommissioning) {
-        if (event.reason == ExecutorLossMessage.decommissionFinished ||
-            event.reason == ExecutorDecommission().message) {
-          metrics.gracefullyDecommissioned.inc()
-        } else {
+      if (event.reason == ExecutorLossMessage.decommissionFinished ||
+        event.reason == ExecutorDecommission().message) {
+        metrics.gracefullyDecommissioned.inc()
+      } else if (removed.decommissioning) {
           metrics.decommissionUnfinished.inc()
-        }
       } else if (removed.pendingRemoval) {
         metrics.driverKilled.inc()
       } else {
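
For readability, the resulting branch order in ExecutorMonitor's executor-removed handler, assembled from the added and context lines of the hunk above, looks roughly like this (the enclosing method and the final fallback branch are elided, so this is an illustrative fragment rather than compilable code):

if (event.reason == ExecutorLossMessage.decommissionFinished ||
    event.reason == ExecutorDecommission().message) {
  metrics.gracefullyDecommissioned.inc()
} else if (removed.decommissioning) {
  metrics.decommissionUnfinished.inc()
} else if (removed.pendingRemoval) {
  metrics.driverKilled.inc()
} else {
  // fallback branch not shown in the hunk
}

The key change is that the loss reason is now checked first, so an executor whose removal reason already says it finished decommissioning is counted as gracefully decommissioned even when the monitor never observed its decommissioning state.
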
diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh
index cd973df257f..4c25b42e964 100755
--- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh
+++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/decom.sh
@@ -18,17 +18,22 @@
 #
 
 
-set -ex
+set +e
+set -x
 echo "Asked to decommission"
 # Find the pid to signal
 date | tee -a ${LOG}
-WORKER_PID=$(ps -o pid -C java | tail -n 1| awk '{ sub(/^[ \t]+/, ""); print }')
+WORKER_PID=$(ps -o pid,cmd -C java |grep Executor \
+              | tail -n 1| awk '{ sub(/^[ \t]+/, ""); print }' \
+              | cut -f 1 -d " ")
 echo "Using worker pid $WORKER_PID"
 kill -s SIGPWR ${WORKER_PID}
-# For now we expect this to timeout, since we don't start exiting the backend.
+# If the worker does exit, stop blocking K8s cleanup. Note this is a "soft"
+# block since the pod itself will have a maximum decommissioning time which will
+# override this.
 echo "Waiting for worker pid to exit"
-# If the worker does exit stop blocking the cleanup.
-timeout 60 tail --pid=${WORKER_PID} -f /dev/null
+tail --pid=${WORKER_PID} -f /dev/null
+sleep 1
 date
 echo "Done"
 date


