devmadhuu commented on code in PR #10074:
URL: https://github.com/apache/ozone/pull/10074#discussion_r3200372579
##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerSyncHelper.java:
##########
@@ -19,86 +19,731 @@
import static
org.apache.hadoop.fs.CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH;
import static
org.apache.hadoop.fs.CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH_DEFAULT;
+import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.CLEANUP;
+import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.CLOSE;
+import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.DELETE;
+import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.FORCE_CLOSE;
+import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent.QUASI_CLOSE;
import static
org.apache.hadoop.ozone.recon.ReconServerConfigKeys.OZONE_RECON_SCM_CONTAINER_ID_BATCH_SIZE;
import static
org.apache.hadoop.ozone.recon.ReconServerConfigKeys.OZONE_RECON_SCM_CONTAINER_ID_BATCH_SIZE_DEFAULT;
+import static
org.apache.hadoop.ozone.recon.ReconServerConfigKeys.OZONE_RECON_SCM_CONTAINER_THRESHOLD;
+import static
org.apache.hadoop.ozone.recon.ReconServerConfigKeys.OZONE_RECON_SCM_CONTAINER_THRESHOLD_DEFAULT;
+import static
org.apache.hadoop.ozone.recon.ReconServerConfigKeys.OZONE_RECON_SCM_DELETED_CONTAINER_CHECK_BATCH_SIZE;
+import static
org.apache.hadoop.ozone.recon.ReconServerConfigKeys.OZONE_RECON_SCM_DELETED_CONTAINER_CHECK_BATCH_SIZE_DEFAULT;
+import static
org.apache.hadoop.ozone.recon.ReconServerConfigKeys.OZONE_RECON_SCM_PER_STATE_DRIFT_THRESHOLD;
+import static
org.apache.hadoop.ozone.recon.ReconServerConfigKeys.OZONE_RECON_SCM_PER_STATE_DRIFT_THRESHOLD_DEFAULT;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
import
org.apache.hadoop.hdds.scm.container.common.helpers.ContainerWithPipeline;
+import
org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
+import org.apache.hadoop.ozone.recon.metrics.ReconScmContainerSyncMetrics;
import org.apache.hadoop.ozone.recon.spi.StorageContainerServiceProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+/**
+ * Helper class that performs targeted incremental sync between SCM and Recon
+ * container metadata. Executes four passes per sync cycle:
+ *
+ * <ol>
+ * <li><b>Pass 1 — CLOSED (SCM-driven, add + correct):</b> fetches SCM's
+ * CLOSED container ID list, adds any absent from Recon, and corrects
+ * containers that are OPEN or CLOSING in Recon but CLOSED in SCM.</li>
+ * <li><b>Pass 2 — OPEN (SCM-driven, add only):</b> adds OPEN containers
+ * that are absent from Recon entirely (e.g., created while Recon was
+ * down).</li>
+ * <li><b>Pass 3 — QUASI_CLOSED (SCM-driven, add only):</b> adds
+ * QUASI_CLOSED containers absent from Recon. Requires that SCM returns
+ * container metadata with a null pipeline when pipeline lookup fails,
and
+ * that Recon's {@code addNewContainer} handles a null pipeline
gracefully;
+ * otherwise QUASI_CLOSED containers whose pipelines have been cleaned up
+ * will fail with {@code NullPointerException} or {@code
IOException}.</li>
+ * <li><b>Pass 4 — DELETED retirement (Recon-driven, transition only):</b>
+ * scans Recon's CLOSED and QUASI_CLOSED containers in batches, queries
+ * SCM for each, and transitions any that SCM reports as DELETED.
+ * Intentionally Recon-driven (not SCM-driven) because SCM's DELETED
+ * list grows unboundedly; starting from Recon's bounded set of
+ * non-terminal containers is always more efficient.</li>
+ * </ol>
+ */
class ReconStorageContainerSyncHelper {
// Serialized size of one ContainerID proto on the wire (varint tag + 8-byte
long = ~12 bytes).
// Used to derive the maximum batch size that fits within
ipc.maximum.data.length.
private static final long CONTAINER_ID_PROTO_SIZE_BYTES = 12;
+ /**
+ * Rotating cursor for Pass 4 (DELETED retirement). Tracks the list position
+ * where the next sync cycle should begin so that all candidates are
+ * eventually covered regardless of batch size. Volatile because it is
+ * updated by the scheduler thread and read by tests.
+ */
+ private volatile int pass4BatchOffset = 0;
+ /**
+ * Monotonic cursor for Pass 2 (OPEN add-only sync). OPEN containers are
+ * created with increasing container IDs, so each cycle only needs to scan
+ * from the last-seen ID onward rather than rescanning the full OPEN set.
+ */
+ private volatile long pass2OpenStartContainerId = 1L;
+
private static final Logger LOG = LoggerFactory
.getLogger(ReconStorageContainerSyncHelper.class);
private final StorageContainerServiceProvider scmServiceProvider;
private final OzoneConfiguration ozoneConfiguration;
private final ReconContainerManager containerManager;
+ private final ReconScmContainerSyncMetrics metrics;
+
+ /**
+ * Describes the action that the periodic scheduler should take based on the
+ * observed drift between SCM and Recon container metadata.
+ */
+ public enum SyncAction {
+ /**
+ * No drift detected — no sync work needed this cycle.
+ */
+ NO_ACTION,
+
+ /**
+ * Small or per-state drift detected — run the four-pass targeted sync.
+ * This is the normal steady-state response: cheaper than a full snapshot
+ * and sufficient for the vast majority of drift scenarios.
+ */
+ TARGETED_SYNC,
+
+ /**
+ * Large total-count drift detected — replace Recon's entire SCM DB with a
Review Comment:
It logs and add metrics for Full Snapshot, else how we'll record the events.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]