Hi, 可以先排查一下 GC 情况,频繁的 Full GC(FGC)也可能导致这类问题(心跳超时、丢失 leadership、任务重启等)。
Best, jjiey > 2021年3月8日 14:37,yidan zhao <hinobl...@gmail.com> 写道: > > 如题,我有个任务频繁发生该异常然后重启。今天任务启动1h后,看了下WEB-UI的检查点也没,restored达到了8已经。然后Exception页面显示该错误,估计大多数都是因为该错误导致的restore。 > 除此外,就是 ‘Job leader for job id eb5d2893c4c6f4034995b9c8e180f01e lost > leadership’ 错导致任务重启。 > > 下面给出刚刚的一个错误日志(环境flink1.12,standalone集群,5JM+5TM,JM和TM混部在相同机器): > 2021-03-08 14:31:40 > org.apache.flink.runtime.io.network.netty.exception.RemoteTransportException: > Error at remote task manager '10.35.185.38/10.35.185.38:2016'. > at org.apache.flink.runtime.io.network.netty. > CreditBasedPartitionRequestClientHandler.decodeMsg( > CreditBasedPartitionRequestClientHandler.java:294) > at org.apache.flink.runtime.io.network.netty. > CreditBasedPartitionRequestClientHandler.channelRead( > CreditBasedPartitionRequestClientHandler.java:183) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:379) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:365) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext > .java:357) > at org.apache.flink.runtime.io.network.netty. > NettyMessageClientDecoderDelegate.channelRead( > NettyMessageClientDecoderDelegate.java:115) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:379) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:365) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext > .java:357) > at org.apache.flink.shaded.netty4.io.netty.channel. 
> DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java: > 1410) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:379) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:365) > at org.apache.flink.shaded.netty4.io.netty.channel. > DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919) > at org.apache.flink.shaded.netty4.io.netty.channel.epoll. > AbstractEpollStreamChannel$EpollStreamUnsafe.epollInReady( > AbstractEpollStreamChannel.java:792) > at org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollEventLoop > .processReady(EpollEventLoop.java:475) > at org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollEventLoop > .run(EpollEventLoop.java:378) > at org.apache.flink.shaded.netty4.io.netty.util.concurrent. > SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) > at org.apache.flink.shaded.netty4.io.netty.util.internal. > ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) > at java.lang.Thread.run(Thread.java:748) > Caused by: org.apache.flink.runtime.io.network.partition. > ProducerFailedException: org.apache.flink.util.FlinkException: JobManager > responsible for eb5d2893c4c6f4034995b9c8e180f01e lost the leadership. > at org.apache.flink.runtime.io.network.netty.PartitionRequestQueue > .writeAndFlushNextMessageIfPossible(PartitionRequestQueue.java:221) > at org.apache.flink.runtime.io.network.netty.PartitionRequestQueue > .enqueueAvailableReader(PartitionRequestQueue.java:108) > at org.apache.flink.runtime.io.network.netty.PartitionRequestQueue > .userEventTriggered(PartitionRequestQueue.java:170) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeUserEventTriggered( > AbstractChannelHandlerContext.java:346) > at org.apache.flink.shaded.netty4.io.netty.channel. 
> AbstractChannelHandlerContext.invokeUserEventTriggered( > AbstractChannelHandlerContext.java:332) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.fireUserEventTriggered( > AbstractChannelHandlerContext.java:324) > at org.apache.flink.shaded.netty4.io.netty.channel. > ChannelInboundHandlerAdapter.userEventTriggered(ChannelInboundHandlerAdapter > .java:117) > at org.apache.flink.shaded.netty4.io.netty.handler.codec. > ByteToMessageDecoder.userEventTriggered(ByteToMessageDecoder.java:365) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeUserEventTriggered( > AbstractChannelHandlerContext.java:346) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeUserEventTriggered( > AbstractChannelHandlerContext.java:332) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.fireUserEventTriggered( > AbstractChannelHandlerContext.java:324) > at org.apache.flink.shaded.netty4.io.netty.channel. > DefaultChannelPipeline$HeadContext.userEventTriggered(DefaultChannelPipeline > .java:1428) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeUserEventTriggered( > AbstractChannelHandlerContext.java:346) > at org.apache.flink.shaded.netty4.io.netty.channel. > AbstractChannelHandlerContext.invokeUserEventTriggered( > AbstractChannelHandlerContext.java:332) > at org.apache.flink.shaded.netty4.io.netty.channel. > DefaultChannelPipeline.fireUserEventTriggered(DefaultChannelPipeline.java: > 913) > at org.apache.flink.runtime.io.network.netty.PartitionRequestQueue > .lambda$notifyReaderNonEmpty$0(PartitionRequestQueue.java:87) > at org.apache.flink.shaded.netty4.io.netty.util.concurrent. > AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164) > at org.apache.flink.shaded.netty4.io.netty.util.concurrent. 
> SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472) > at org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollEventLoop > .run(EpollEventLoop.java:387) > ... 3 more > Caused by: org.apache.flink.util.FlinkException: JobManager responsible for > eb5d2893c4c6f4034995b9c8e180f01e lost the leadership. > at org.apache.flink.runtime.taskexecutor.TaskExecutor > .disconnectJobManagerConnection(TaskExecutor.java:1422) > at org.apache.flink.runtime.taskexecutor.TaskExecutor.access$1300( > TaskExecutor.java:174) > at org.apache.flink.runtime.taskexecutor. > TaskExecutor$JobLeaderListenerImpl.lambda$null$2(TaskExecutor.java:1856) > at java.util.Optional.ifPresent(Optional.java:159) > at org.apache.flink.runtime.taskexecutor. > TaskExecutor$JobLeaderListenerImpl.lambda$jobManagerLostLeadership$3( > TaskExecutor.java:1855) > at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync( > AkkaRpcActor.java:404) > at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage( > AkkaRpcActor.java:197) > at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage( > AkkaRpcActor.java:154) > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123) > at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > at akka.actor.Actor$class.aroundReceive(Actor.scala:517) > at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > at akka.actor.ActorCell.invoke(ActorCell.scala:561) > at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > at akka.dispatch.Mailbox.run(Mailbox.scala:225) > at 
akka.dispatch.Mailbox.exec(Mailbox.scala:235) > at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool > .java:1339) > at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread > .java:107) > Caused by: java.lang.Exception: Job leader for job id > eb5d2893c4c6f4034995b9c8e180f01e lost leadership. > ... 24 more > > > (1)zookeeper的超时设置的是60s,感觉网络异常zk超时不至于60s都不够。 > (2)akka.ask.timeout: 60s > taskmanager.network.request-backoff.max: 60000 > akka此参数之前也调整为60s了。 > > 如上信息,希望社区同学们给点思路。 >