sumitagrawl commented on code in PR #10074:
URL: https://github.com/apache/ozone/pull/10074#discussion_r3223962757
##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java:
##########
@@ -432,34 +437,72 @@ public void start() {
} else {
initializePipelinesFromScm();
}
- LOG.debug("Started the SCM Container Info sync scheduler.");
- long interval = ozoneConfiguration.getTimeDuration(
- OZONE_RECON_SCM_SNAPSHOT_TASK_INTERVAL_DELAY,
- OZONE_RECON_SCM_SNAPSHOT_TASK_INTERVAL_DEFAULT, TimeUnit.MILLISECONDS);
- long initialDelay = ozoneConfiguration.getTimeDuration(
- OZONE_RECON_SCM_SNAPSHOT_TASK_INITIAL_DELAY,
- OZONE_RECON_SCM_SNAPSHOT_TASK_INITIAL_DELAY_DEFAULT,
+ // -----------------------------------------------------------------------
+ // Scheduler (incremental/targeted sync): runs every 1h (default).
+ //
+ // Each cycle calls decideSyncAction() — two lightweight count RPCs to SCM
+ // — and then:
+ //
+ // non-OPEN drift > threshold (default 1,000,000)
+ // → warn and expose the drift via metrics; full snapshot is not
+ // downloaded automatically by this periodic task
+ //
+ // 0 < |total drift| <= threshold
+ // → targeted sync: 4-pass incremental repair
+ //
+ // total drift = 0 but per-state drift (OPEN, QUASI_CLOSED, or CLOSED)
+ // >= threshold (default 1)
+ // → targeted sync: corrects containers stuck in a stale lifecycle state
+ //
+ // no drift detected
+ // → no action this cycle
+ //
+ // Running this on a 1h cadence means container state discrepancies are
+ // detected and corrected without an unconditional periodic full snapshot.
+ // -----------------------------------------------------------------------
+ long syncInterval = ozoneConfiguration.getTimeDuration(
+ OZONE_RECON_SCM_CONTAINER_SYNC_TASK_INTERVAL_DELAY,
+ OZONE_RECON_SCM_CONTAINER_SYNC_TASK_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS);
+ long syncInitialDelay = ozoneConfiguration.getTimeDuration(
+ OZONE_RECON_SCM_CONTAINER_SYNC_TASK_INITIAL_DELAY,
+ OZONE_RECON_SCM_CONTAINER_SYNC_TASK_INITIAL_DELAY_DEFAULT,
TimeUnit.MILLISECONDS);
- // This periodic sync with SCM container cache is needed because during
- // the window when recon will be down and any container being added
- // newly and went missing, that container will not be reported as missing by
- // recon till there is a difference of container count equivalent to
- // threshold value defined in "ozone.recon.scm.container.threshold"
- // between SCM container cache and recon container cache.
+ LOG.debug("Started the SCM Container Info sync scheduler (interval={}ms, initialDelay={}ms).",
+ syncInterval, syncInitialDelay);
scheduler.scheduleWithFixedDelay(() -> {
+ if (!isSyncDataFromSCMRunning.compareAndSet(false, true)) {
+ LOG.debug("SCM container info sync is already running; skipping this cycle.");
+ return;
+ }
try {
- boolean isSuccess = syncWithSCMContainerInfo();
- if (!isSuccess) {
- LOG.debug("SCM container info sync is already running.");
+ ReconStorageContainerSyncHelper.SyncAction action =
+ containerSyncHelper.decideSyncAction();
+ switch (action) {
+ case LARGE_DRIFT_THRESHOLD_EXCEEDED:
+ break;
Review Comment:
Instead of a full sync, we can run just a targeted sync here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]