[jira] [Commented] (FLINK-35485) JobMaster failed with "the job xx has not been finished"
[ https://issues.apache.org/jira/browse/FLINK-35485?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17854891#comment-17854891 ] Xingcan Cui commented on FLINK-35485: - Hi [~mapohl], I hit the exception again today and it caused the jobmanager to restart. Still no WARN+ logs. I checked the corresponding job and it actually finished succefully. > JobMaster failed with "the job xx has not been finished" > > > Key: FLINK-35485 > URL: https://issues.apache.org/jira/browse/FLINK-35485 > Project: Flink > Issue Type: Bug > Components: Runtime / Coordination >Affects Versions: 1.18.1 >Reporter: Xingcan Cui >Priority: Major > > We ran a session cluster on K8s and used Flink SQL gateway to submit queries. > Hit the following rare exception once which caused the job manager to restart. > {code:java} > org.apache.flink.util.FlinkException: JobMaster for job > 50d681ae1e8170f77b4341dda6aba9bc failed. > at > org.apache.flink.runtime.dispatcher.Dispatcher.jobMasterFailed(Dispatcher.java:1454) > at > org.apache.flink.runtime.dispatcher.Dispatcher.jobManagerRunnerFailed(Dispatcher.java:776) > at > org.apache.flink.runtime.dispatcher.Dispatcher.lambda$runJob$6(Dispatcher.java:698) > at java.base/java.util.concurrent.CompletableFuture.uniHandle(Unknown > Source) > at > java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(Unknown > Source) > at java.base/java.util.concurrent.CompletableFuture$Completion.run(Unknown > Source) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:451) > at > org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:451) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:218) > at > org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:85) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:168) > at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33) > at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29) > at scala.PartialFunction.applyOrElse(PartialFunction.scala:127) > at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) > at > org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) > at org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547) > at org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545) > at > org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229) > at org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590) > at org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557) > at org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280) > at org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241) > at org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253) > at java.base/java.util.concurrent.ForkJoinTask.doExec(Unknown Source) > at > java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(Unknown > Source) > at java.base/java.util.concurrent.ForkJoinPool.scan(Unknown Source) > at java.base/java.util.concurrent.ForkJoinPool.runWorker(Unknown Source) > at java.base/java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source) > Caused by: org.apache.flink.runtime.jobmaster.JobNotFinishedException: The > job (50d681ae1e8170f77b4341dda6aba9bc) has not been finished. > at > org.apache.flink.runtime.jobmaster.DefaultJobMasterServiceProcess.closeAsync(DefaultJobMasterServiceProcess.java:157) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcess(JobMasterServiceLeadershipRunner.java:431) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.callIfRunning(JobMasterServiceLeadershipRunner.java:476) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$stopJobMasterServiceProcessAsync$12(JobMasterServiceLeadershipRunner.java:407) > at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(Unknown > Source) > at java.base/java.util.concurrent.CompletableFuture.thenCompose(Unknown > Source) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:405) > at > org.apache.flink.runtime.jobm
[jira] [Commented] (FLINK-35485) JobMaster failed with "the job xx has not been finished"
[ https://issues.apache.org/jira/browse/FLINK-35485?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17852267#comment-17852267 ] Xingcan Cui commented on FLINK-35485: - Hi [~mapohl], unfortunately, I failed to fetch more logs from the cluster. If it happens again, I'll try to get some logs and post them here. Thanks! > JobMaster failed with "the job xx has not been finished" > > > Key: FLINK-35485 > URL: https://issues.apache.org/jira/browse/FLINK-35485 > Project: Flink > Issue Type: Bug > Components: Runtime / Coordination >Affects Versions: 1.18.1 >Reporter: Xingcan Cui >Priority: Major > > We ran a session cluster on K8s and used Flink SQL gateway to submit queries. > Hit the following rare exception once which caused the job manager to restart. > {code:java} > org.apache.flink.util.FlinkException: JobMaster for job > 50d681ae1e8170f77b4341dda6aba9bc failed. > at > org.apache.flink.runtime.dispatcher.Dispatcher.jobMasterFailed(Dispatcher.java:1454) > at > org.apache.flink.runtime.dispatcher.Dispatcher.jobManagerRunnerFailed(Dispatcher.java:776) > at > org.apache.flink.runtime.dispatcher.Dispatcher.lambda$runJob$6(Dispatcher.java:698) > at java.base/java.util.concurrent.CompletableFuture.uniHandle(Unknown > Source) > at > java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(Unknown > Source) > at java.base/java.util.concurrent.CompletableFuture$Completion.run(Unknown > Source) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:451) > at > org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:451) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:218) > at > org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:85) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:168) > at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33) > at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29) > at scala.PartialFunction.applyOrElse(PartialFunction.scala:127) > at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) > at > org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) > at org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547) > at org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545) > at > org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229) > at org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590) > at org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557) > at org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280) > at org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241) > at org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253) > at java.base/java.util.concurrent.ForkJoinTask.doExec(Unknown Source) > at > java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(Unknown > Source) > at java.base/java.util.concurrent.ForkJoinPool.scan(Unknown Source) > at java.base/java.util.concurrent.ForkJoinPool.runWorker(Unknown Source) > at java.base/java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source) > Caused by: org.apache.flink.runtime.jobmaster.JobNotFinishedException: The > job (50d681ae1e8170f77b4341dda6aba9bc) has not been finished. > at > org.apache.flink.runtime.jobmaster.DefaultJobMasterServiceProcess.closeAsync(DefaultJobMasterServiceProcess.java:157) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcess(JobMasterServiceLeadershipRunner.java:431) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.callIfRunning(JobMasterServiceLeadershipRunner.java:476) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$stopJobMasterServiceProcessAsync$12(JobMasterServiceLeadershipRunner.java:407) > at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(Unknown > Source) > at java.base/java.util.concurrent.CompletableFuture.thenCompose(Unknown > Source) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:405) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeader
[jira] [Commented] (FLINK-35485) JobMaster failed with "the job xx has not been finished"
[ https://issues.apache.org/jira/browse/FLINK-35485?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17850600#comment-17850600 ] Matthias Pohl commented on FLINK-35485: --- Hi [~xccui], thanks for reporting the issue. Can you provide the JobManager logs for this case? That would help getting a better understanding of what was going on. > JobMaster failed with "the job xx has not been finished" > > > Key: FLINK-35485 > URL: https://issues.apache.org/jira/browse/FLINK-35485 > Project: Flink > Issue Type: Bug > Components: Runtime / Coordination >Affects Versions: 1.18.1 >Reporter: Xingcan Cui >Priority: Major > > We ran a session cluster on K8s and used Flink SQL gateway to submit queries. > Hit the following rare exception once which caused the job manager to restart. > {code:java} > org.apache.flink.util.FlinkException: JobMaster for job > 50d681ae1e8170f77b4341dda6aba9bc failed. > at > org.apache.flink.runtime.dispatcher.Dispatcher.jobMasterFailed(Dispatcher.java:1454) > at > org.apache.flink.runtime.dispatcher.Dispatcher.jobManagerRunnerFailed(Dispatcher.java:776) > at > org.apache.flink.runtime.dispatcher.Dispatcher.lambda$runJob$6(Dispatcher.java:698) > at java.base/java.util.concurrent.CompletableFuture.uniHandle(Unknown > Source) > at > java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(Unknown > Source) > at java.base/java.util.concurrent.CompletableFuture$Completion.run(Unknown > Source) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:451) > at > org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:451) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:218) > at > org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:85) > at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:168) > at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33) > at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29) > at scala.PartialFunction.applyOrElse(PartialFunction.scala:127) > at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) > at > org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) > at org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547) > at org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545) > at > org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229) > at org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590) > at org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557) > at org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280) > at org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241) > at org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253) > at java.base/java.util.concurrent.ForkJoinTask.doExec(Unknown Source) > at > java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(Unknown > Source) > at java.base/java.util.concurrent.ForkJoinPool.scan(Unknown Source) > at java.base/java.util.concurrent.ForkJoinPool.runWorker(Unknown Source) > at java.base/java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source) > Caused by: org.apache.flink.runtime.jobmaster.JobNotFinishedException: The > job (50d681ae1e8170f77b4341dda6aba9bc) has not been finished. > at > org.apache.flink.runtime.jobmaster.DefaultJobMasterServiceProcess.closeAsync(DefaultJobMasterServiceProcess.java:157) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcess(JobMasterServiceLeadershipRunner.java:431) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.callIfRunning(JobMasterServiceLeadershipRunner.java:476) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$stopJobMasterServiceProcessAsync$12(JobMasterServiceLeadershipRunner.java:407) > at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(Unknown > Source) > at java.base/java.util.concurrent.CompletableFuture.thenCompose(Unknown > Source) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:405) > at > org.apache.flink.runtime.jobmaster.Jo