[
https://issues.apache.org/jira/browse/SPARK-56593?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
zuotingbing updated SPARK-56593:
--------------------------------
Description:
When I set spark.kubernetes.allocation.driver.readinessTimeout=600s, the
driver pod is stuck for 10 minutes, which is unreasonable.
My test involved starting a ThriftServer service within the driver pod, with
port number 18000. The driver pod's readinessProbe is `tcpSocket: port: 18000`.
*stack error info:*
2026-04-22 22:09:52,747 ERROR Utils: Uncaught exception in thread main
io.fabric8.kubernetes.client.KubernetesClientTimeoutException: Timed out
waiting for [600000] milliseconds for [Pod] with
name:[default-sparksql-driver-0] in namespace [zenap].
at
io.fabric8.kubernetes.client.dsl.internal.BaseOperation.waitUntilCondition(BaseOperation.java:889)
at
io.fabric8.kubernetes.client.dsl.internal.BaseOperation.waitUntilReady(BaseOperation.java:871)
at
io.fabric8.kubernetes.client.dsl.internal.BaseOperation.waitUntilReady(BaseOperation.java:92)
at
org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$2(ExecutorPodsAllocator.scala:140)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1382)
at
org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$1(ExecutorPodsAllocator.scala:140)
at
org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$1$adapted(ExecutorPodsAllocator.scala:132)
at scala.Option.foreach(Option.scala:437) ~[scala-library-2.13.16.jar:?]
at
org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.start(ExecutorPodsAllocator.scala:132)
at
org.apache.spark.scheduler.cluster.k8s.KubernetesClusterSchedulerBackend.start(KubernetesClusterSchedulerBackend.scala:107)
at
org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:238)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:604)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2888)
at
org.apache.spark.sql.SparkSession$Builder.$anonfun$getOrCreate$2(SparkSession.scala:1150)
at scala.Option.getOrElse(Option.scala:201)
[scala-library-2.13.16.jar:?]
at
org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:1144)
at
org.apache.spark.sql.hive.thriftserver.SparkSQLEnv$.init(SparkSQLEnv.scala:64)
at
org.apache.spark.sql.hive.thriftserver.HiveThriftServer2$.main(HiveThriftServer2.scala:116)
at
org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.main(HiveThriftServer2.scala)
at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
~[?:?]
at jdk.internal.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
~[?:?]
at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown
Source) ~[?:?]
at java.lang.reflect.Method.invoke(Unknown Source) ~[?:?]
at
org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at
org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:1045)
at
org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:199)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:222)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
at
org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1136)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1145)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
was:
When I set spark.kubernetes.allocation.driver.readinessTimeout=600s, the
driver pod is stuck for 10 minutes, which is unreasonable.
My test involved starting a ThriftServer service within the driver pod, with
port number 18000. The driver pod's readinessProbe is `tcpSocket: port: 18000`.
stack error info:
2026-04-22 22:09:52,747 ERROR Utils: Uncaught exception in thread main
io.fabric8.kubernetes.client.KubernetesClientTimeoutException: Timed out
waiting for [600000] milliseconds for [Pod] with
name:[default-sparksql-driver-0] in namespace [zenap].
at
io.fabric8.kubernetes.client.dsl.internal.BaseOperation.waitUntilCondition(BaseOperation.java:889)
at
io.fabric8.kubernetes.client.dsl.internal.BaseOperation.waitUntilReady(BaseOperation.java:871)
at
io.fabric8.kubernetes.client.dsl.internal.BaseOperation.waitUntilReady(BaseOperation.java:92)
at
org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$2(ExecutorPodsAllocator.scala:140)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1382)
at
org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$1(ExecutorPodsAllocator.scala:140)
at
org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$1$adapted(ExecutorPodsAllocator.scala:132)
at scala.Option.foreach(Option.scala:437) ~[scala-library-2.13.16.jar:?]
at
org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.start(ExecutorPodsAllocator.scala:132)
at
org.apache.spark.scheduler.cluster.k8s.KubernetesClusterSchedulerBackend.start(KubernetesClusterSchedulerBackend.scala:107)
at
org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:238)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:604)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2888)
at
org.apache.spark.sql.SparkSession$Builder.$anonfun$getOrCreate$2(SparkSession.scala:1150)
at scala.Option.getOrElse(Option.scala:201)
[scala-library-2.13.16.jar:?]
at
org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:1144)
at
org.apache.spark.sql.hive.thriftserver.SparkSQLEnv$.init(SparkSQLEnv.scala:64)
at
org.apache.spark.sql.hive.thriftserver.HiveThriftServer2$.main(HiveThriftServer2.scala:116)
at
org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.main(HiveThriftServer2.scala)
at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
~[?:?]
at jdk.internal.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
~[?:?]
at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown
Source) ~[?:?]
at java.lang.reflect.Method.invoke(Unknown Source) ~[?:?]
at
org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at
org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:1045)
at
org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:199)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:222)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
at
org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1136)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1145)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
> [k8s] [hive-thriftserver] when i set
> spark.kubernetes.allocation.driver.readinessTimeout=600s , the driver pod is
> stuck for 10 minutes which is unreasonable.
> -------------------------------------------------------------------------------------------------------------------------------------------------------------
>
> Key: SPARK-56593
> URL: https://issues.apache.org/jira/browse/SPARK-56593
> Project: Spark
> Issue Type: Bug
> Components: Kubernetes, SQL
> Affects Versions: 4.1.1
> Reporter: zuotingbing
> Priority: Major
> Attachments: image-2026-04-23-16-22-43-146.png
>
>
> When I set spark.kubernetes.allocation.driver.readinessTimeout=600s, the
> driver pod is stuck for 10 minutes, which is unreasonable.
> My test involved starting a ThriftServer service within the driver pod, with
> port number 18000. The driver pod's readinessProbe is `tcpSocket: port:
> 18000`.
>
> *stack error info:*
> 2026-04-22 22:09:52,747 ERROR Utils: Uncaught exception in thread main
> io.fabric8.kubernetes.client.KubernetesClientTimeoutException: Timed out
> waiting for [600000] milliseconds for [Pod] with
> name:[default-sparksql-driver-0] in namespace [zenap].
> at
> io.fabric8.kubernetes.client.dsl.internal.BaseOperation.waitUntilCondition(BaseOperation.java:889)
>
> at
> io.fabric8.kubernetes.client.dsl.internal.BaseOperation.waitUntilReady(BaseOperation.java:871)
>
> at
> io.fabric8.kubernetes.client.dsl.internal.BaseOperation.waitUntilReady(BaseOperation.java:92)
>
> at
> org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$2(ExecutorPodsAllocator.scala:140)
>
> at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1382)
> at
> org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$1(ExecutorPodsAllocator.scala:140)
>
> at
> org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$1$adapted(ExecutorPodsAllocator.scala:132)
>
> at scala.Option.foreach(Option.scala:437)
> ~[scala-library-2.13.16.jar:?]
> at
> org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.start(ExecutorPodsAllocator.scala:132)
> at
> org.apache.spark.scheduler.cluster.k8s.KubernetesClusterSchedulerBackend.start(KubernetesClusterSchedulerBackend.scala:107)
>
> at
> org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:238)
> at org.apache.spark.SparkContext.<init>(SparkContext.scala:604)
> at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2888)
> at
> org.apache.spark.sql.SparkSession$Builder.$anonfun$getOrCreate$2(SparkSession.scala:1150)
> at scala.Option.getOrElse(Option.scala:201)
> [scala-library-2.13.16.jar:?]
> at
> org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:1144)
>
> at
> org.apache.spark.sql.hive.thriftserver.SparkSQLEnv$.init(SparkSQLEnv.scala:64)
>
> at
> org.apache.spark.sql.hive.thriftserver.HiveThriftServer2$.main(HiveThriftServer2.scala:116)
>
> at
> org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.main(HiveThriftServer2.scala)
>
> at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native
> Method) ~[?:?]
> at jdk.internal.reflect.NativeMethodAccessorImpl.invoke(Unknown
> Source) ~[?:?]
> at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown
> Source) ~[?:?]
> at java.lang.reflect.Method.invoke(Unknown Source) ~[?:?]
> at
> org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
> at
> org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:1045)
> at
> org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:199)
> at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:222)
> at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
> at
> org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1136)
> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1145)
> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]