gyfora commented on code in PR #997: URL: https://github.com/apache/flink-kubernetes-operator/pull/997#discussion_r2204032353
########## flink-kubernetes-operator-api/src/main/java/org/apache/flink/kubernetes/operator/api/status/CommonStatus.java: ########## @@ -90,6 +90,28 @@ public ResourceLifecycleState getLifecycleState() { return ResourceLifecycleState.FAILED; } + // Check for unrecoverable deployments that should be marked as FAILED + if (this instanceof FlinkDeploymentStatus) { + FlinkDeploymentStatus deploymentStatus = (FlinkDeploymentStatus) this; + var jmDeployStatus = deploymentStatus.getJobManagerDeploymentStatus(); + + // ERROR/MISSING deployments are in terminal error state + // [Configmaps deleted -> require manual restore] and should always be FAILED + if ((jmDeployStatus == JobManagerDeploymentStatus.MISSING + || jmDeployStatus == JobManagerDeploymentStatus.ERROR) + && StringUtils.isNotEmpty(error) + && (error.toLowerCase() + .contains( + "it is possible that the job has finished or terminally failed, or the configmaps have been deleted") + || error.toLowerCase().contains("manual restore required") + || error.toLowerCase().contains("ha metadata not available") + || error.toLowerCase() + .contains( + "ha data is not available to make stateful upgrades"))) { Review Comment: Why are we checking this specific error? In any case, we are the ones triggering this error, so please create a constant in the `AbstractJobReconciler` and use that here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org