I have never seen this before, but you also should not set the cluster-id in your
config; that should be controlled by the operator itself.
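
Roughly, the relevant part of the FlinkDeployment would look like the sketch below
(resource names and gs:// paths are placeholders): keep the HA settings, but drop both
kubernetes.cluster-id and high-availability.cluster-id, since the operator fills in the
cluster id itself (as far as I know, derived from the resource name).

```
# Sketch only: <deployment-name>, <namespace> and the gs:// paths are placeholders.
# Note there is no kubernetes.cluster-id or high-availability.cluster-id key;
# the operator sets the cluster id based on the FlinkDeployment resource.
apiVersion: flink.apache.org/v1beta1
kind: FlinkDeployment
metadata:
  name: <deployment-name>
  namespace: <namespace>
spec:
  flinkVersion: v1_16
  flinkConfiguration:
    high-availability: kubernetes
    high-availability.storageDir: gs://<path/to/environment>/ha
    state.checkpoints.dir: gs://<path/to/environment>/checkpoints
    state.savepoints.dir: gs://<path/to/environment>/savepoints
```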

Gyula

On Fri, Mar 31, 2023 at 2:39 PM Pierre Bedoucha <pierre.bedou...@tv2.no>
wrote:

> Hi,
>
>
>
> We are trying to use Flink Kubernetes Operator 1.4.0 with Flink 1.16.
>
>
>
> However, at the job-manager deployment step we get the following error:
> ```
> Exception in thread "main" java.lang.NullPointerException
>         at org.apache.flink.runtime.entrypoint.ClusterEntrypoint.shutDownAsync(ClusterEntrypoint.java:585)
>         at org.apache.flink.runtime.entrypoint.ClusterEntrypoint.startCluster(ClusterEntrypoint.java:242)
>         at org.apache.flink.runtime.entrypoint.ClusterEntrypoint.runClusterEntrypoint(ClusterEntrypoint.java:729)
>         at org.apache.flink.kubernetes.entrypoint.KubernetesApplicationClusterEntrypoint.main(KubernetesApplicationClusterEntrypoint.java:86)
> ```
> It seems it is related to the following line:
>
> ```
> this.clusterId =
>         checkNotNull(flinkConfig.getString(KubernetesConfigOptions.CLUSTER_ID),
>                 "ClusterId must be specified!");
> ```
> We specified the CLUSTER_ID but it seems that the flinkConfig object is
> not handled correctly.
>
> We have the following flinkConfiguration defined in deployment.yaml:
> ```
> spec:
>   flinkConfiguration:
>     execution.checkpointing.externalized-checkpoint-retention: RETAIN_ON_CANCELLATION
>     execution.checkpointing.interval: 120s
>     execution.checkpointing.min-pause: 120s
>     execution.checkpointing.mode: AT_LEAST_ONCE
>     execution.checkpointing.snapshot-compression: "false"
>     execution.checkpointing.timeout: 3000s
>     execution.checkpointing.tolerable-failed-checkpoints: "5"
>     execution.checkpointing.unaligned: "false"
>     fs.hdfs.hadoopconf: /opt/hadoop-conf/
>     high-availability.storageDir: gs://<path/to/environment>/ha
>     high-availability: kubernetes
>     high-availability.cluster-id: <cluster-id>
>     kubernetes.operator.periodic.savepoint.interval: 6h
>     kubernetes.operator.savepoint.history.max.age: 72h
>     kubernetes.operator.savepoint.history.max.count: "15"
>     metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter
>     metrics.reporter.prom.port: "2112"
>     metrics.reporters: prom
>     rest.flamegraph.enabled: "false"
>     state.backend: rocksdb
>     state.backend.incremental: "false"
>     state.backend.rocksdb.localdir: /rocksdb
>     state.checkpoint-storage: filesystem
>     state.checkpoints.dir: gs://<path/to/environment>/checkpoints
>     state.savepoints.dir: gs://<path/to/environment>/savepoints
>     taskmanager.memory.managed.fraction: "0"
>     taskmanager.network.memory.buffer-debloat.enabled: "false"
>     taskmanager.network.memory.buffer-debloat.period: "200"
>     taskmanager.network.memory.buffers-per-channel: "2"
>     taskmanager.network.memory.floating-buffers-per-gate: "8"
>     taskmanager.network.memory.max-buffers-per-channel: "10"
>     taskmanager.network.sort-shuffle.min-buffers: "512"
>     taskmanager.numberOfTaskSlots: "1"
>     kubernetes.taskmanager.cpu.limit-factor: "4"
>     kubernetes.taskmanager.cpu: "0.5"
>     kubernetes.cluster-id: <cluster-id>
> ```
> Has anyone encountered this issue before?
>
> Thanks,
> PB
>
