Github user dvogelbacher commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21366#discussion_r192200162

    --- Diff: resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleEventHandler.scala ---
    @@ -0,0 +1,130 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements. See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License. You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.spark.scheduler.cluster.k8s
    +
    +import com.google.common.cache.{Cache, CacheBuilder}
    +import io.fabric8.kubernetes.api.model.Pod
    +import io.fabric8.kubernetes.client.KubernetesClient
    +import scala.collection.JavaConverters._
    +
    +import org.apache.spark.SparkConf
    +import org.apache.spark.deploy.k8s.Config._
    +import org.apache.spark.deploy.k8s.Constants._
    +import org.apache.spark.scheduler.ExecutorExited
    +import org.apache.spark.util.Utils
    +
    +private[spark] class ExecutorPodsLifecycleEventHandler(
    +    conf: SparkConf,
    +    executorBuilder: KubernetesExecutorBuilder,
    +    kubernetesClient: KubernetesClient,
    +    podsEventQueue: ExecutorPodsEventQueue,
    +    // Use a best-effort to track which executors have been removed already. It's not generally
    +    // job-breaking if we remove executors more than once but it's ideal if we make an attempt
    +    // to avoid doing so. Expire cache entries so that this data structure doesn't grow beyond
    +    // bounds.
    +    removedExecutorsCache: Cache[java.lang.Long, java.lang.Long]) {
    +
    +  import ExecutorPodsLifecycleEventHandler._
    +
    +  private val eventProcessingInterval = conf.get(KUBERNETES_EXECUTOR_EVENT_PROCESSING_INTERVAL)
    +
    +  def start(schedulerBackend: KubernetesClusterSchedulerBackend): Unit = {
    +    podsEventQueue.addSubscriber(eventProcessingInterval) { updatedPods =>
    +      updatedPods.foreach { updatedPod =>
    +        processUpdatedPod(schedulerBackend, updatedPod)
    +      }
    +    }
    +  }
    +
    +  private def processUpdatedPod(
    +      schedulerBackend: KubernetesClusterSchedulerBackend, updatedPod: Pod) = {
    +    val execId = updatedPod.getMetadata.getLabels.get(SPARK_EXECUTOR_ID_LABEL).toLong
    +    if (isDeleted(updatedPod)) {
    +      removeExecutorFromSpark(schedulerBackend, updatedPod, execId)
    +    } else {
    +      updatedPod.getStatus.getPhase.toLowerCase match {
    +        // TODO (SPARK-24135) - handle more classes of errors
    +        case "error" | "failed" | "succeeded" =>
    --- End diff --

    btw, I just tried testing a pod with an init error locally, and `kubectl get -o=yaml pod {name} | grep phase` gives me `phase: Failed`. So I think the init errors tracked in `SPARK-24135` will be handled by this match as well.
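    To make that concrete, here is a minimal, purely illustrative sketch (not part of this diff) of how such a pod would flow through the match above. It assumes fabric8's `PodBuilder`; the pod name is hypothetical, and the phase string is the `Failed` I observed via kubectl:

    ```scala
    import io.fabric8.kubernetes.api.model.{Pod, PodBuilder}

    // Build a pod resembling one whose init container errored: kubectl
    // reports its top-level status phase as "Failed" in that case.
    val initErrorPod: Pod = new PodBuilder()
      .withNewMetadata()
        .withName("spark-exec-1") // hypothetical executor pod name
      .endMetadata()
      .withNewStatus()
        .withPhase("Failed")
      .endStatus()
      .build()

    // Same match shape as processUpdatedPod: the init-error pod lands in
    // the terminal branch because its phase lower-cases to "failed".
    initErrorPod.getStatus.getPhase.toLowerCase match {
      case "error" | "failed" | "succeeded" => println("treated as terminated")
      case other => println(s"not terminal yet: $other")
    }
    ```

    So even without a dedicated init-error case, the `"failed"` branch should already cover that scenario.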