anupamaggarwal commented on code in PR #821:
URL: https://github.com/apache/flink-kubernetes-operator/pull/821#discussion_r1691872393


##########
flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/reconciler/snapshot/StateSnapshotReconciler.java:
##########
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.kubernetes.operator.reconciler.snapshot;
+
+import org.apache.flink.configuration.CheckpointingOptions;
+import org.apache.flink.kubernetes.operator.api.FlinkDeployment;
+import org.apache.flink.kubernetes.operator.api.FlinkStateSnapshot;
+import org.apache.flink.kubernetes.operator.api.spec.FlinkStateSnapshotSpec;
+import org.apache.flink.kubernetes.operator.controller.FlinkStateSnapshotContext;
+import org.apache.flink.kubernetes.operator.exception.ReconciliationException;
+import org.apache.flink.kubernetes.operator.reconciler.ReconciliationUtils;
+import org.apache.flink.kubernetes.operator.service.FlinkResourceContextFactory;
+import org.apache.flink.kubernetes.operator.utils.EventRecorder;
+import org.apache.flink.kubernetes.operator.utils.FlinkStateSnapshotUtils;
+import org.apache.flink.kubernetes.operator.utils.SnapshotUtils;
+import org.apache.flink.util.Preconditions;
+
+import io.javaoperatorsdk.operator.api.reconciler.DeleteControl;
+import lombok.RequiredArgsConstructor;
+import org.apache.commons.lang3.ObjectUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Optional;
+
+import static org.apache.flink.kubernetes.operator.api.status.FlinkStateSnapshotStatus.State.TRIGGER_PENDING;
+
+/** The reconciler for the {@link org.apache.flink.kubernetes.operator.api.FlinkStateSnapshot}. */
+@RequiredArgsConstructor
+public class StateSnapshotReconciler {
+
+    private static final Logger LOG = LoggerFactory.getLogger(StateSnapshotReconciler.class);
+
+    private final FlinkResourceContextFactory ctxFactory;
+    private final EventRecorder eventRecorder;
+
+    public void reconcile(FlinkStateSnapshotContext ctx) {
+        var resource = ctx.getResource();
+
+        var savepointState = resource.getStatus().getState();
+        if (!TRIGGER_PENDING.equals(savepointState)) {
+            return;
+        }
+
+        if (resource.getSpec().isSavepoint()
+                && resource.getSpec().getSavepoint().getAlreadyExists()) {
+            LOG.info(
+                    "Snapshot {} is marked as completed in spec, skipping triggering savepoint.",
+                    resource.getMetadata().getName());
+
+            FlinkStateSnapshotUtils.snapshotSuccessful(
+                    resource, resource.getSpec().getSavepoint().getPath(), true);
+            return;
+        }
+
+        if (FlinkStateSnapshotUtils.abandonSnapshotIfJobNotRunning(
+                ctx.getKubernetesClient(),
+                ctx.getResource(),
+                ctx.getSecondaryResource().orElse(null),
+                eventRecorder)) {
+            return;
+        }
+
+        var jobId = ctx.getSecondaryResource().orElseThrow().getStatus().getJobStatus().getJobId();
+
+        Optional<String> triggerIdOpt;
+        try {
+            triggerIdOpt = triggerCheckpointOrSavepoint(resource.getSpec(), ctx, jobId);
+        } catch (Exception e) {
+            LOG.error("Failed to trigger snapshot for resource {}", ctx.getResource(), e);
+            throw new ReconciliationException(e);
+        }
+
+        if (triggerIdOpt.isEmpty()) {
+            LOG.warn("Failed to trigger snapshot {}", resource.getMetadata().getName());
+            return;
+        }
+
+        FlinkStateSnapshotUtils.snapshotInProgress(resource, triggerIdOpt.get());
+    }
+
+    public DeleteControl cleanup(FlinkStateSnapshotContext ctx) throws Exception {
+        var resource = ctx.getResource();
+        var state = resource.getStatus().getState();
+        var resourceName = resource.getMetadata().getName();
+        LOG.info("Cleaning up resource {}...", resourceName);
+
+        if (resource.getSpec().isCheckpoint()) {
+            return DeleteControl.defaultDelete();
+        }
+        if (!resource.getSpec().getSavepoint().getDisposeOnDelete()) {
+            return DeleteControl.defaultDelete();
+        }
+        if (resource.getSpec().getJobReference() == null
+                || resource.getSpec().getJobReference().getName() == null) {
+            return DeleteControl.defaultDelete();
+        }
+
+        switch (state) {
+            case IN_PROGRESS:
+                LOG.info(
+                        "Cannot delete resource {} yet as savepoint is still in progress...",
+                        resourceName);
+                return DeleteControl.noFinalizerRemoval()
+                        .rescheduleAfter(ctx.getOperatorConfig().getReconcileInterval().toMillis());
+            case COMPLETED:
+                var flinkDeployment = getFlinkDeployment(ctx);
+                return handleSnapshotCleanup(resource, flinkDeployment, ctx);

Review Comment:
   I was trying out the periodic snapshot feature today. Does it make sense to introduce a feature to auto-clean the `flinkstatesnapshots.flink.apache.org` CRs and only retain up to some max retention limit (we could still retain the savepoint dir?). IIUC `disposeOnDelete` for periodic savepoints is set to false by default, so it should be relatively safe to do so.
   
   If the user sets the savepoint interval `kubernetes.operator.periodic.savepoint.interval` to a low value and there are multiple jobs on the cluster, it would lead to the creation of many CRs. I am not familiar with K8s internals, but could this increase the load on the API server?
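   To illustrate what I mean, here is a rough sketch (the helper and the `maxRetainedCount` limit are made up, not part of this PR) of pruning old snapshot CRs with the fabric8 client while leaving the savepoint data itself alone:

```java
// Rough sketch only -- "pruneOldSnapshots" and "maxRetainedCount" are hypothetical,
// not part of the PR. It just shows the kind of retention-based cleanup I had in
// mind: keep the newest N snapshot CRs and delete the rest. Deleting the CR leaves
// the savepoint directory untouched as long as disposeOnDelete stays false
// (the default for periodic savepoints).
import io.fabric8.kubernetes.client.KubernetesClient;
import org.apache.flink.kubernetes.operator.api.FlinkStateSnapshot;

import java.util.Comparator;
import java.util.List;

public class SnapshotRetentionSketch {

    public static void pruneOldSnapshots(
            KubernetesClient client, String namespace, int maxRetainedCount) {
        List<FlinkStateSnapshot> snapshots =
                client.resources(FlinkStateSnapshot.class)
                        .inNamespace(namespace)
                        .list()
                        .getItems();

        snapshots.stream()
                // creationTimestamp is RFC 3339, so lexicographic order is chronological
                .sorted(
                        Comparator.comparing(
                                        (FlinkStateSnapshot s) ->
                                                s.getMetadata().getCreationTimestamp())
                                .reversed())
                // keep the newest N CRs, delete everything older
                .skip(maxRetainedCount)
                .forEach(s -> client.resource(s).delete());
    }
}
```

   In practice this would probably need to be scoped per job (e.g. via a label selector) and limited to periodic snapshots, but hopefully it gets the idea across.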



##########
flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/utils/FlinkStateSnapshotUtils.java:
##########
@@ -0,0 +1,382 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.kubernetes.operator.utils;
+
+import org.apache.flink.autoscaler.utils.DateTimeUtils;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.kubernetes.operator.api.AbstractFlinkResource;
+import org.apache.flink.kubernetes.operator.api.CrdConstants;
+import org.apache.flink.kubernetes.operator.api.FlinkStateSnapshot;
+import org.apache.flink.kubernetes.operator.api.spec.CheckpointSpec;
+import org.apache.flink.kubernetes.operator.api.spec.FlinkStateSnapshotReference;
+import org.apache.flink.kubernetes.operator.api.spec.FlinkStateSnapshotSpec;
+import org.apache.flink.kubernetes.operator.api.spec.JobReference;
+import org.apache.flink.kubernetes.operator.api.spec.SavepointSpec;
+import org.apache.flink.kubernetes.operator.api.status.CheckpointType;
+import org.apache.flink.kubernetes.operator.api.status.SavepointFormatType;
+import org.apache.flink.kubernetes.operator.api.status.SnapshotTriggerType;
+import org.apache.flink.kubernetes.operator.config.FlinkOperatorConfiguration;
+import org.apache.flink.kubernetes.operator.config.KubernetesOperatorConfigOptions;
+import org.apache.flink.kubernetes.operator.reconciler.ReconciliationUtils;
+import org.apache.flink.kubernetes.operator.reconciler.SnapshotType;
+
+import io.fabric8.kubernetes.api.model.ObjectMeta;
+import io.fabric8.kubernetes.client.KubernetesClient;
+import org.apache.commons.lang3.StringUtils;
+
+import javax.annotation.Nullable;
+
+import java.time.Instant;
+import java.util.UUID;
+
+import static org.apache.flink.kubernetes.operator.api.status.FlinkStateSnapshotStatus.State.ABANDONED;
+import static org.apache.flink.kubernetes.operator.api.status.FlinkStateSnapshotStatus.State.COMPLETED;
+import static org.apache.flink.kubernetes.operator.api.status.FlinkStateSnapshotStatus.State.IN_PROGRESS;
+import static org.apache.flink.kubernetes.operator.api.status.FlinkStateSnapshotStatus.State.TRIGGER_PENDING;
+import static org.apache.flink.kubernetes.operator.config.KubernetesOperatorConfigOptions.SNAPSHOT_RESOURCE_ENABLED;
+import static org.apache.flink.kubernetes.operator.reconciler.SnapshotType.CHECKPOINT;
+import static org.apache.flink.kubernetes.operator.reconciler.SnapshotType.SAVEPOINT;
+
+/** Utilities class for FlinkStateSnapshot resources. */
+public class FlinkStateSnapshotUtils {
+
+    /**
+     * From a snapshot reference, return its snapshot path. If a {@link FlinkStateSnapshot} is
+     * referenced, it will be retrieved from Kubernetes.
+     *
+     * @param kubernetesClient kubernetes client
+     * @param snapshotRef snapshot reference
+     * @return found savepoint path
+     */
+    public static String getValidatedFlinkStateSnapshotPath(
+            KubernetesClient kubernetesClient, FlinkStateSnapshotReference snapshotRef) {
+        if (StringUtils.isNotBlank(snapshotRef.getPath())) {
+            return snapshotRef.getPath();
+        }
+
+        if (StringUtils.isBlank(snapshotRef.getName())) {
+            throw new IllegalArgumentException(
+                    String.format("Invalid snapshot name: %s", snapshotRef.getName()));
+        }
+
+        var result =
+                snapshotRef.getNamespace() == null
+                        ? kubernetesClient
+                                .resources(FlinkStateSnapshot.class)
+                                .withName(snapshotRef.getName())
+                                .get()
+                        : kubernetesClient
+                                .resources(FlinkStateSnapshot.class)
+                                .inNamespace(snapshotRef.getNamespace())
+                                .withName(snapshotRef.getName())
+                                .get();
+
+        if (result == null) {
+            throw new IllegalStateException(
+                    String.format(
+                            "Cannot find snapshot %s in namespace %s.",
+                            snapshotRef.getName(), snapshotRef.getNamespace()));
+        }
+
+        // We can return the savepoint path if it's marked as completed without waiting for the
+        // reconciler to update its status.
+        if (result.getSpec().isSavepoint() && result.getSpec().getSavepoint().getAlreadyExists()) {
+            var path = result.getSpec().getSavepoint().getPath();
+            if (!StringUtils.isBlank(path)) {
+                return path;
+            }
+        }
+
+        if (COMPLETED != result.getStatus().getState()) {
+            throw new IllegalStateException(
+                    String.format(
+                            "Snapshot %s/%s is not complete yet.",
+                            snapshotRef.getNamespace(), snapshotRef.getName()));
+        }
+
+        var path = result.getStatus().getPath();
+        if (StringUtils.isBlank(path)) {
+            throw new IllegalStateException(
+                    String.format(
+                            "Snapshot %s/%s path is incorrect: %s.",
+                            snapshotRef.getNamespace(), snapshotRef.getName(), path));
+        }
+
+        return path;
+    }
+
+    protected static FlinkStateSnapshot createFlinkStateSnapshot(
+            KubernetesClient kubernetesClient,
+            String name,
+            FlinkStateSnapshotSpec spec,
+            SnapshotTriggerType triggerType) {
+        var metadata = new ObjectMeta();
+        metadata.setName(name);
+        metadata.getLabels().put(CrdConstants.LABEL_SNAPSHOT_TYPE, triggerType.name());
+
+        var snapshot = new FlinkStateSnapshot();
+        snapshot.setSpec(spec);
+        snapshot.setMetadata(metadata);
+
+        return kubernetesClient.resource(snapshot).create();
+    }
+
+    /**
+     * Creates a savepoint {@link FlinkStateSnapshot} resource on the Kubernetes cluster.
+     *
+     * @param kubernetesClient kubernetes client
+     * @param resource Flink resource associated
+     * @param savepointPath savepoint path if any
+     * @param triggerType trigger type
+     * @param savepointFormatType format type
+     * @param disposeOnDelete should dispose of data on deletion
+     * @return created snapshot
+     */
+    public static FlinkStateSnapshot createSavepointResource(
+            KubernetesClient kubernetesClient,
+            AbstractFlinkResource<?, ?> resource,
+            @Nullable String savepointPath,
+            SnapshotTriggerType triggerType,
+            SavepointFormatType savepointFormatType,
+            boolean disposeOnDelete) {
+        var savepointSpec =
+                SavepointSpec.builder()
+                        .path(savepointPath)
+                        .formatType(savepointFormatType)
+                        .disposeOnDelete(disposeOnDelete)
+                        .alreadyExists(triggerType == SnapshotTriggerType.UPGRADE)
+                        .build();
+
+        var snapshotSpec =
+                FlinkStateSnapshotSpec.builder()
+                        .jobReference(JobReference.fromFlinkResource(resource))
+                        .savepoint(savepointSpec)
+                        .build();
+
+        var resourceName = getFlinkStateSnapshotName(SAVEPOINT, triggerType, resource);
+        return createFlinkStateSnapshot(kubernetesClient, resourceName, snapshotSpec, triggerType);
+    }
+
+    /**
+     * Creates a checkpoint {@link FlinkStateSnapshot} resource on the Kubernetes cluster.
+     *
+     * @param kubernetesClient kubernetes client
+     * @param resource Flink resource associated
+     * @param checkpointType type of checkpoint
+     * @param triggerType trigger type
+     * @return created snapshot
+     */
+    public static FlinkStateSnapshot createCheckpointResource(
+            KubernetesClient kubernetesClient,
+            AbstractFlinkResource<?, ?> resource,
+            CheckpointType checkpointType,
+            SnapshotTriggerType triggerType) {
+        var checkpointSpec = CheckpointSpec.builder().checkpointType(checkpointType).build();
+
+        var snapshotSpec =
+                FlinkStateSnapshotSpec.builder()
+                        .jobReference(JobReference.fromFlinkResource(resource))
+                        .checkpoint(checkpointSpec)
+                        .build();
+
+        var resourceName = getFlinkStateSnapshotName(CHECKPOINT, triggerType, resource);
+        return createFlinkStateSnapshot(kubernetesClient, resourceName, snapshotSpec, triggerType);
+    }
+
+    /**
+     * Based on job configuration and operator configuration, decide if {@link FlinkStateSnapshot}
+     * resources should be used or not. The operator configuration is used to disable their usage
+     * if the corresponding CRD was not installed on this Kubernetes cluster.
+     *
+     * @param operatorConfiguration operator config
+     * @param configuration job config
+     * @return true if snapshot resources should be created
+     */
+    public static boolean isSnapshotResourceEnabled(
+            FlinkOperatorConfiguration operatorConfiguration, Configuration configuration) {
+        return configuration.get(SNAPSHOT_RESOURCE_ENABLED)
+                && operatorConfiguration.isSnapshotResourcesEnabled();
+    }
+
+    /**
+     * Return a generated name for a {@link FlinkStateSnapshot} to be created.
+     *
+     * @param snapshotType type of snapshot
+     * @param triggerType trigger type of snapshot
+     * @param referencedResource referenced resource
+     * @return result name
+     */
+    public static String getFlinkStateSnapshotName(
+            SnapshotType snapshotType,
+            SnapshotTriggerType triggerType,
+            AbstractFlinkResource<?, ?> referencedResource) {
+        return String.format(
+                "%s-%s-%s-%s",
+                referencedResource.getMetadata().getName(),
+                snapshotType.name().toLowerCase(),
+                triggerType.name().toLowerCase(),
+                UUID.randomUUID());

Review Comment:
   Hi @mateczagany, wdyt about having a timestamp / epoch value instead of the UUID here? It might help users figure out the latest snapshot CR when they do a `kubectl get flinkstatesnapshots.flink.apache.org` (and not have to pass `--sort-by=.metadata.creationTimestamp` explicitly)?
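   Something like this is what I have in mind (illustrative only, the timestamp format and the random suffix are just one option) so that plain `kubectl get` output sorts roughly chronologically:

```java
// Illustrative sketch, not part of the PR: name the CR with a UTC timestamp instead
// of a UUID, plus a short random suffix to avoid collisions when two snapshots are
// triggered within the same second. Kubernetes names must be lowercase RFC 1123
// names, which this format satisfies (assuming the resource name already is).
import java.time.Instant;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.concurrent.ThreadLocalRandom;

public class SnapshotNamingSketch {

    private static final DateTimeFormatter TS_FORMAT =
            DateTimeFormatter.ofPattern("yyyyMMdd-HHmmss").withZone(ZoneOffset.UTC);

    public static String getFlinkStateSnapshotName(
            String resourceName, String snapshotType, String triggerType) {
        // e.g. "my-deployment-savepoint-periodic-20240725-103015-0042"
        return String.format(
                "%s-%s-%s-%s-%04d",
                resourceName,
                snapshotType.toLowerCase(),
                triggerType.toLowerCase(),
                TS_FORMAT.format(Instant.now()),
                ThreadLocalRandom.current().nextInt(10000));
    }
}
```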
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org
