[ https://issues.apache.org/jira/browse/SPARK-35304?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17576866#comment-17576866 ]
Emilie Lin commented on SPARK-35304: ------------------------------------ Hi [~ocworld] do you have any updates for this issue? > [k8s] Though finishing a job, the driver pod is running infinitely > ------------------------------------------------------------------ > > Key: SPARK-35304 > URL: https://issues.apache.org/jira/browse/SPARK-35304 > Project: Spark > Issue Type: Bug > Components: Kubernetes > Affects Versions: 3.0.1, 3.0.2, 3.1.1 > Reporter: Keunhyun Oh > Priority: Major > > Even after the job finishes, the driver pod keeps running indefinitely. > Executors are terminated. However, the driver status is not changed to > succeeded. > This does not occur with Spark 2 on k8s. > It only appears with Spark 3. > > My JVM dump is as follows: > {code:java} > 2021-05-04 15:11:37 > Full thread dump OpenJDK 64-Bit Server VM (25.252-b09 mixed mode): > "Attach Listener" #182 daemon prio=9 os_prio=0 tid=0x00007f02bc001000 > nid=0x106 waiting on condition [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > Locked ownable synchronizers: > - None > "DestroyJavaVM" #179 prio=5 os_prio=0 tid=0x00007f0fe0017000 nid=0x35 waiting > on condition [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > Locked ownable synchronizers: > - None > "s3a-transfer-unbounded-pool2-t1" #172 daemon prio=5 os_prio=0 > tid=0x00007f025d98d000 nid=0xe5 waiting on condition [0x00007f01f86f3000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00007f0353681b38> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074) > at > 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > Locked ownable synchronizers: > - None > "java-sdk-progress-listener-callback-thread" #169 daemon prio=5 os_prio=0 > tid=0x00007f002000f000 nid=0xe2 waiting on condition [0x00007f004f7f6000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00007f0bdb1ba7c0> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > Locked ownable synchronizers: > - None > "pool-26-thread-1" #72 prio=5 os_prio=0 tid=0x00007f025c829000 nid=0x80 > waiting on condition [0x00007f01ba931000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00007f0bfdeaa8f0> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134) > 
at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > Locked ownable synchronizers: > - None > "java-sdk-http-connection-reaper" #56 daemon prio=5 os_prio=0 > tid=0x00007f025d818000 nid=0x6e waiting on condition [0x00007f01fb9fe000] > java.lang.Thread.State: TIMED_WAITING (sleeping) > at java.lang.Thread.sleep(Native Method) > at > com.amazonaws.http.IdleConnectionReaper.run(IdleConnectionReaper.java:188) > Locked ownable synchronizers: > - None > "Timer for 's3a-file-system' metrics system" #55 daemon prio=5 os_prio=0 > tid=0x00007f0fe19e6800 nid=0x6d in Object.wait() [0x00007f029c1d8000] > java.lang.Thread.State: TIMED_WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > at java.util.TimerThread.mainLoop(Timer.java:552) > - locked <0x00007f0353383bd0> (a java.util.TaskQueue) > at java.util.TimerThread.run(Timer.java:505) > Locked ownable synchronizers: > - None > "MutableQuantiles-0" #54 daemon prio=5 os_prio=0 tid=0x00007f025d78b800 > nid=0x6c runnable [0x00007f029c2d9000] > java.lang.Thread.State: TIMED_WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00007f0351a09dd8> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at > java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) > at > java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:1093) > at > java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:809) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1074) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at 
java.lang.Thread.run(Thread.java:748) > Locked ownable synchronizers: > - None > "org.apache.hadoop.fs.FileSystem$Statistics$StatisticsDataReferenceCleaner" > #13 daemon prio=5 os_prio=0 tid=0x00007f0fe1182000 nid=0x45 in Object.wait() > [0x00007f02c50d7000] > java.lang.Thread.State: WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > - waiting on <0x00007f0350ea4390> (a java.lang.ref.ReferenceQueue$Lock) > at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:144) > - locked <0x00007f0350ea4390> (a java.lang.ref.ReferenceQueue$Lock) > at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:165) > at > org.apache.hadoop.fs.FileSystem$Statistics$StatisticsDataReferenceCleaner.run(FileSystem.java:3839) > at java.lang.Thread.run(Thread.java:748) > Locked ownable synchronizers: > - None > "Service Thread" #7 daemon prio=9 os_prio=0 tid=0x00007f0fe00e2000 nid=0x3f > runnable [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > Locked ownable synchronizers: > - None > "C1 CompilerThread1" #6 daemon prio=9 os_prio=0 tid=0x00007f0fe00c7800 > nid=0x3e waiting on condition [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > Locked ownable synchronizers: > - None > "C2 CompilerThread0" #5 daemon prio=9 os_prio=0 tid=0x00007f0fe00c4800 > nid=0x3d waiting on condition [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > Locked ownable synchronizers: > - None > "Signal Dispatcher" #4 daemon prio=9 os_prio=0 tid=0x00007f0fe00c2800 > nid=0x3c runnable [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > Locked ownable synchronizers: > - None > "Finalizer" #3 daemon prio=8 os_prio=0 tid=0x00007f0fe0090000 nid=0x3b in > Object.wait() [0x00007f033fffe000] > java.lang.Thread.State: WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:144) > - locked <0x00007f03504176d8> (a java.lang.ref.ReferenceQueue$Lock) > at 
java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:165) > at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:216) > Locked ownable synchronizers: > - None > "Reference Handler" #2 daemon prio=10 os_prio=0 tid=0x00007f0fe008b800 > nid=0x3a in Object.wait() [0x00007f034416a000] > java.lang.Thread.State: WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > at java.lang.Object.wait(Object.java:502) > at java.lang.ref.Reference.tryHandlePending(Reference.java:191) > - locked <0x00007f0350424c20> (a java.lang.ref.Reference$Lock) > at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:153) > Locked ownable synchronizers: > - None > "VM Thread" os_prio=0 tid=0x00007f0fe0082000 nid=0x39 runnable > "GC task thread#0 (ParallelGC)" os_prio=0 tid=0x00007f0fe002c000 nid=0x36 > runnable > "GC task thread#1 (ParallelGC)" os_prio=0 tid=0x00007f0fe002d800 nid=0x37 > runnable > "GC task thread#2 (ParallelGC)" os_prio=0 tid=0x00007f0fe002f800 nid=0x38 > runnable > "VM Periodic Task Thread" os_prio=0 tid=0x00007f0fe00ec800 nid=0x40 waiting > on condition > JNI global references: 6244 > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org