Thanks for the note.
The root cause is the following:
Caused by: org.apache.flink.util.FlinkRuntimeException: Failed to start the
operator coordinators
at
org.apache.flink.runtime.scheduler.DefaultOperatorCoordinatorHandler.startOperatorCoordinators(DefaultOperatorCoordinatorHandler.java:169)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.scheduler.DefaultOperatorCoordinatorHandler.startAllOperatorCoordinators(DefaultOperatorCoordinatorHandler.java:82)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.scheduler.SchedulerBase.startScheduling(SchedulerBase.java:624)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.jobmaster.JobMaster.startScheduling(JobMaster.java:1010)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.jobmaster.JobMaster.startJobExecution(JobMaster.java:927)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.jobmaster.JobMaster.onStart(JobMaster.java:388)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.rpc.RpcEndpoint.internalCallOnStart(RpcEndpoint.java:181)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.rpc.akka.AkkaRpcActor$StoppedState.lambda$start$0(AkkaRpcActor.java:612)
~[flink-rpc-akka_db70a2fa-991e-4392-9447-5d060aeb156e.jar:1.15.0]
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
~[flink-rpc-akka_db70a2fa-991e-4392-9447-5d060aeb156e.jar:1.15.0]
at
org.apache.flink.runtime.rpc.akka.AkkaRpcActor$StoppedState.start(AkkaRpcActor.java:611)
~[flink-rpc-akka_db70a2fa-991e-4392-9447-5d060aeb156e.jar:1.15.0]
at
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleControlMessage(AkkaRpcActor.java:185)
~[flink-rpc-akka_db70a2fa-991e-4392-9447-5d060aeb156e.jar:1.15.0]
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24) ~[?:?]
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20) ~[?:?]
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
~[flink-scala_2.12-1.15.0.jar:1.15.0]
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
~[flink-scala_2.12-1.15.0.jar:1.15.0]
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
~[?:?]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
~[flink-scala_2.12-1.15.0.jar:1.15.0]
... 13 more
Caused by: java.lang.RuntimeException: java.net.URISyntaxException: Relative
path in absolute URI: file:~/usr/bin/hudi/tables/t1/.hoodie
at
org.apache.hudi.common.fs.HoodieWrapperFileSystem.convertPathWithScheme(HoodieWrapperFileSystem.java:156)
~[?:?]
at
org.apache.hudi.common.fs.HoodieWrapperFileSystem.convertToDefaultPath(HoodieWrapperFileSystem.java:961)
~[?:?]
at
org.apache.hudi.common.fs.HoodieWrapperFileSystem.lambda$getFileStatus$17(HoodieWrapperFileSystem.java:398)
~[?:?]
at
org.apache.hudi.common.fs.HoodieWrapperFileSystem.executeFuncWithTimeMetrics(HoodieWrapperFileSystem.java:106)
~[?:?]
at
org.apache.hudi.common.fs.HoodieWrapperFileSystem.getFileStatus(HoodieWrapperFileSystem.java:396)
~[?:?]
at
org.apache.hudi.exception.TableNotFoundException.checkTableValidity(TableNotFoundException.java:51)
~[?:?]
at
org.apache.hudi.common.table.HoodieTableMetaClient.<init>(HoodieTableMetaClient.java:128)
~[?:?]
at
org.apache.hudi.common.table.HoodieTableMetaClient.newMetaClient(HoodieTableMetaClient.java:642)
~[?:?]
at
org.apache.hudi.common.table.HoodieTableMetaClient.access$000(HoodieTableMetaClient.java:80)
~[?:?]
at
org.apache.hudi.common.table.HoodieTableMetaClient$Builder.build(HoodieTableMetaClient.java:711)
~[?:?]
at
org.apache.hudi.common.table.HoodieTableMetaClient.initTableAndGetMetaClient(HoodieTableMetaClient.java:466)
~[?:?]
at
org.apache.hudi.common.table.HoodieTableMetaClient$PropertyBuilder.initTable(HoodieTableMetaClient.java:1122)
~[?:?]
at
org.apache.hudi.util.StreamerUtil.initTableIfNotExists(StreamerUtil.java:323)
~[?:?]
at
org.apache.hudi.util.StreamerUtil.initTableIfNotExists(StreamerUtil.java:293)
~[?:?]
at
org.apache.hudi.sink.StreamWriteOperatorCoordinator.start(StreamWriteOperatorCoordinator.java:179)
~[?:?]
at
org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder.start(OperatorCoordinatorHolder.java:194)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.scheduler.DefaultOperatorCoordinatorHandler.startOperatorCoordinators(DefaultOperatorCoordinatorHandler.java:164)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.scheduler.DefaultOperatorCoordinatorHandler.startAllOperatorCoordinators(DefaultOperatorCoordinatorHandler.java:82)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.scheduler.SchedulerBase.startScheduling(SchedulerBase.java:624)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.jobmaster.JobMaster.startScheduling(JobMaster.java:1010)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.jobmaster.JobMaster.startJobExecution(JobMaster.java:927)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.jobmaster.JobMaster.onStart(JobMaster.java:388)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.rpc.RpcEndpoint.internalCallOnStart(RpcEndpoint.java:181)
~[flink-dist-1.15.0.jar:1.15.0]
at
org.apache.flink.runtime.rpc.akka.AkkaRpcActor$StoppedState.lambda$start$0(AkkaRpcActor.java:612)
~[flink-rpc-akka_db70a2fa-991e-4392-9447-5d060aeb156e.jar:1.15.0]
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
~[flink-rpc-akka_db70a2fa-991e-4392-9447-5d060aeb156e.jar:1.15.0]
at
org.apache.flink.runtime.rpc.akka.AkkaRpcActor$StoppedState.start(AkkaRpcActor.java:611)
~[flink-rpc-akka_db70a2fa-991e-4392-9447-5d060aeb156e.jar:1.15.0]
at
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleControlMessage(AkkaRpcActor.java:185)
~[flink-rpc-akka_db70a2fa-991e-4392-9447-5d060aeb156e.jar:1.15.0]
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24) ~[?:?]
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20) ~[?:?]
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
~[flink-scala_2.12-1.15.0.jar:1.15.0]
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
~[flink-scala_2.12-1.15.0.jar:1.15.0]
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
~[?:?]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
~[flink-scala_2.12-1.15.0.jar:1.15.0]
I’m not sure whether it’s proper to kill the whole cluster just because a
wrong job configuration was used (a relative path was set).
> 2022年10月14日 19:53,Matthias Pohl via user <[email protected]> 写道:
>
> Hi Jie Han,
> welcome to the community. Just a little side note: These kinds of questions
> are more suitable to be asked in the user mailing list. The dev mailing list
> is rather used for discussing feature development or project-related topics.
> See [1] for further details.
>
> About your question: The stacktrace you're providing indicates that something
> went wrong while initiating the job execution. Unfortunately, the actual
> reason is not clear because that's not included in your stacktrace (it should
> be listed as a cause for the JobMasterException in your logs). You're right
> in assuming that Flink is able to handle certain kinds of user code and
> infrastructure-related errors by restarting the job. But there might be other
> Flink cluster internal errors that could cause a Flink cluster shutdown. It's
> hard to tell from the logs you provided. Usually, it's a good habit to share
> a reasonable amount of logs to make investigating the issue easier right away.
>
> Let's move the discussion into the user mailing list in case you have further
> questions.
>
> Best,
> Matthias
>
> [1] https://flink.apache.org/community.html#mailing-lists
> <https://flink.apache.org/community.html#mailing-lists>
> On Fri, Oct 14, 2022 at 10:13 AM Jie Han <[email protected]
> <mailto:[email protected]>> wrote:
> Hi, guys, I’m new to apache flink. It’s exciting to join the community!
>
> When I tried out flink 1.15.0, I ran into some confusing problems; here is
> the streamlined log:
>
> org.apache.flink.runtime.rpc.akka.exceptions.AkkaRpcException: Could not
> start RpcEndpoint jobmanager_2.
> at
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor$StoppedState.start(AkkaRpcActor.java:617)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleControlMessage(AkkaRpcActor.java:185)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.japi.pf
> <http://akka.japi.pf/>.UnitCaseStatement.apply(CaseStatements.scala:24)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.japi.pf
> <http://akka.japi.pf/>.UnitCaseStatement.apply(CaseStatements.scala:20)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.japi.pf
> <http://akka.japi.pf/>.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.actor.Actor.aroundReceive(Actor.scala:537)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.actor.Actor.aroundReceive$(Actor.scala:535)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.actor.ActorCell.invoke(ActorCell.scala:548)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270)
> [flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.dispatch.Mailbox.run(Mailbox.scala:231)
> [flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at akka.dispatch.Mailbox.exec(Mailbox.scala:243)
> [flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
> [?:1.8.0_301]
> at
> java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1067)
> [?:1.8.0_301]
> at
> java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1703)
> [?:1.8.0_301]
> at
> java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:172)
> [?:1.8.0_301]
> Caused by: org.apache.flink.runtime.jobmaster.JobMasterException: Could not
> start the JobMaster.
> at
> org.apache.flink.runtime.jobmaster.JobMaster.onStart(JobMaster.java:390)
> ~[flink-dist-1.15.0.jar:1.15.0]
> at
> org.apache.flink.runtime.rpc.RpcEndpoint.internalCallOnStart(RpcEndpoint.java:181)
> ~[flink-dist-1.15.0.jar:1.15.0]
> at
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor$StoppedState.lambda$start$0(AkkaRpcActor.java:612)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at
> org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> at
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor$StoppedState.start(AkkaRpcActor.java:611)
> ~[flink-rpc-akka_65043be6-9dc5-4303-a760-61bd044fb53a.jar:1.15.0]
> ... 20 more
> …
>
> 2022-10-14 15:13:30,493 INFO
> org.apache.flink.runtime.entrypoint.ClusterEntrypoint [] - Shutting
> StandaloneSessionClusterEntrypoint down with application status UNKNOWN.
> Diagnostics Cluster entrypoint has been closed externally..
>
> As recorded in the log, the standalone session cluster was shut down by the
> jobmaster exception. I thought any job’s exception should not shut down the
> cluster.
> So, is this action expected?