你在日志里搜索一下,看看有没有 "too old resource version" 的报错。
另外,测试一下 Pod 和 APIServer 之间的网络状态,看看连接是不是经常断开。

Best,
Yang

macdoor <macd...@gmail.com> 于2021年1月18日周一 上午9:45写道:

> 大约每隔几十分钟就会 restart 一次,请教各位大佬有没有排查思路。每次抛出的错误都是一样的,而且运行一段时间后还会积累很多 ConfigMap。下面是一个具体的错误:
>
> 错误内容
>
> 2021-01-17 04:16:46,116 ERROR org.apache.flink.runtime.resourcemanager.StandaloneResourceManager [] - Fatal error occurred in ResourceManager.
> org.apache.flink.runtime.leaderretrieval.LeaderRetrievalException: Error while watching the ConfigMap test-flink-etl-42557c3f6325ffc876958430859178cd-jobmanager-leader
>         at org.apache.flink.kubernetes.highavailability.KubernetesLeaderRetrievalDriver$ConfigMapCallbackHandlerImpl.handleFatalError(KubernetesLeaderRetrievalDriver.java:120) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.kubeclient.resources.AbstractKubernetesWatcher.onClose(AbstractKubernetesWatcher.java:48) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at io.fabric8.kubernetes.client.utils.WatcherToggle.onClose(WatcherToggle.java:56) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at io.fabric8.kubernetes.client.dsl.internal.WatchConnectionManager.closeEvent(WatchConnectionManager.java:367) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at io.fabric8.kubernetes.client.dsl.internal.WatchConnectionManager.access$700(WatchConnectionManager.java:50) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at io.fabric8.kubernetes.client.dsl.internal.WatchConnectionManager$1.onMessage(WatchConnectionManager.java:259) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.ws.RealWebSocket.onReadMessage(RealWebSocket.java:323) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.ws.WebSocketReader.readMessageFrame(WebSocketReader.java:219) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.ws.WebSocketReader.processNextFrame(WebSocketReader.java:105) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.ws.RealWebSocket.loopReader(RealWebSocket.java:274) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.ws.RealWebSocket$2.onResponse(RealWebSocket.java:214) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.RealCall$AsyncCall.execute(RealCall.java:206) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.NamedRunnable.run(NamedRunnable.java:32) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_275]
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_275]
>         at java.lang.Thread.run(Thread.java:748) [?:1.8.0_275]
> 2021-01-17 04:16:46,117 ERROR org.apache.flink.runtime.entrypoint.ClusterEntrypoint        [] - Fatal error occurred in the cluster entrypoint.
> org.apache.flink.runtime.leaderretrieval.LeaderRetrievalException: Error while watching the ConfigMap test-flink-etl-42557c3f6325ffc876958430859178cd-jobmanager-leader
>         at org.apache.flink.kubernetes.highavailability.KubernetesLeaderRetrievalDriver$ConfigMapCallbackHandlerImpl.handleFatalError(KubernetesLeaderRetrievalDriver.java:120) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.kubeclient.resources.AbstractKubernetesWatcher.onClose(AbstractKubernetesWatcher.java:48) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at io.fabric8.kubernetes.client.utils.WatcherToggle.onClose(WatcherToggle.java:56) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at io.fabric8.kubernetes.client.dsl.internal.WatchConnectionManager.closeEvent(WatchConnectionManager.java:367) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at io.fabric8.kubernetes.client.dsl.internal.WatchConnectionManager.access$700(WatchConnectionManager.java:50) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at io.fabric8.kubernetes.client.dsl.internal.WatchConnectionManager$1.onMessage(WatchConnectionManager.java:259) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.ws.RealWebSocket.onReadMessage(RealWebSocket.java:323) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.ws.WebSocketReader.readMessageFrame(WebSocketReader.java:219) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.ws.WebSocketReader.processNextFrame(WebSocketReader.java:105) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.ws.RealWebSocket.loopReader(RealWebSocket.java:274) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.ws.RealWebSocket$2.onResponse(RealWebSocket.java:214) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.RealCall$AsyncCall.execute(RealCall.java:206) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at org.apache.flink.kubernetes.shaded.okhttp3.internal.NamedRunnable.run(NamedRunnable.java:32) [flink-dist_2.11-1.12.1.jar:1.12.1]
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_275]
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_275]
>         at java.lang.Thread.run(Thread.java:748) [?:1.8.0_275]
> 2021-01-17 04:16:46,164 INFO  org.apache.flink.runtime.blob.BlobServer [] - Stopped BLOB server at 0.0.0.0:6124
>
> jobmanager重启后,查看有这个 ConfigMap
> test-flink-etl-42557c3f6325ffc876958430859178cd-jobmanager-leader
>
> [gum@docker-repos ~]$ kubectl -n gem-flink get cm test-flink-etl-42557c3f6325ffc876958430859178cd-jobmanager-leader -o yaml
> apiVersion: v1
> data:
>   address: akka.tcp://flink@flink-jobmanager:6123/user/rpc/jobmanager_3
>   sessionId: c0f99c65-af3c-4916-ae7c-c272e2987e31
> kind: ConfigMap
> metadata:
>   annotations:
>     control-plane.alpha.kubernetes.io/leader: '{"holderIdentity":"5fd98e66-8f6e-4871-b349-fd8760e9eb6b","leaseDuration":15.000000000,"acquireTime":"2021-01-17T03:43:12.444000Z","renewTime":"2021-01-17T03:51:52.460000Z","leaderTransitions":105}'
>   creationTimestamp: "2021-01-17T03:43:12Z"
>   labels:
>     app: test-flink-etl
>     configmap-type: high-availability
>     type: flink-native-kubernetes
>   name: test-flink-etl-42557c3f6325ffc876958430859178cd-jobmanager-leader
>   namespace: gem-flink
>   resourceVersion: "39527319"
>   selfLink: /api/v1/namespaces/gem-flink/configmaps/test-flink-etl-42557c3f6325ffc876958430859178cd-jobmanager-leader
>   uid: 70b979b5-b696-47b7-8eb8-558e8887f2c9
>
>
>
>
> --
> Sent from: http://apache-flink.147419.n8.nabble.com/
>

回复