rakeshadr commented on code in PR #10384:
URL: https://github.com/apache/ozone/pull/10384#discussion_r3370664360
##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/metrics/ReconScmContainerSyncMetrics.java:
##########
@@ -75,19 +107,115 @@ public void unRegister() {
ms.unregisterSource(SOURCE_NAME);
}
- public void setTargetedSyncStatus(int status) {
- targetedSyncStatus.set(status);
+ public void setScmContainerSyncStatus(int status) {
+ scmContainerSyncStatus.set(status);
}
public void setLastTargetedSyncDurationMs(long durationMs) {
lastTargetedSyncDurationMs.set(durationMs);
}
- public int getTargetedSyncStatus() {
- return targetedSyncStatus.value();
+ public void setContainerSyncDurationMs(
+ HddsProtos.LifeCycleState state, long durationMs) {
+ setStateGauge(containerSyncDurationMs, state, durationMs);
+ }
+
+ public void setContainerCountDrift(
+ HddsProtos.LifeCycleState state, long drift) {
+ setStateGauge(containerCountDrift, state, drift);
+ }
+
+ public int getScmContainerSyncStatus() {
+ return scmContainerSyncStatus.get();
}
public long getLastTargetedSyncDurationMs() {
- return lastTargetedSyncDurationMs.value();
+ return lastTargetedSyncDurationMs.get();
+ }
+
+ public long getContainerSyncDurationMs(
+ HddsProtos.LifeCycleState state) {
+ return getStateGauge(containerSyncDurationMs, state);
+ }
+
+ public long getContainerCountDrift(
+ HddsProtos.LifeCycleState state) {
+ return getStateGauge(containerCountDrift, state);
+ }
+
+ @Override
+ public void getMetrics(MetricsCollector collector, boolean all) {
+ MetricsRecordBuilder builder = collector.addRecord(SOURCE_NAME);
+ builder.addGauge(SCM_CONTAINER_SYNC_STATUS, getScmContainerSyncStatus());
+ builder.addGauge(LAST_TARGETED_SYNC_DURATION_MS,
+ getLastTargetedSyncDurationMs());
+ for (HddsProtos.LifeCycleState state : SYNC_STATES) {
+ builder.addGauge(containerSyncDurationMetricInfo.get(state),
+ getContainerSyncDurationMs(state));
+ builder.addGauge(containerCountDriftMetricInfo.get(state),
+ getContainerCountDrift(state));
+ }
+ }
+
+ private static Map<HddsProtos.LifeCycleState, AtomicLong>
+ initStateGaugeValues() {
+ Map<HddsProtos.LifeCycleState, AtomicLong> gauges =
+ new EnumMap<>(HddsProtos.LifeCycleState.class);
+ for (HddsProtos.LifeCycleState state : SYNC_STATES) {
+ gauges.put(state, new AtomicLong());
+ }
+ return Collections.unmodifiableMap(gauges);
+ }
+
+ private static Map<HddsProtos.LifeCycleState, MetricsInfo>
+ initSyncDurationMetricInfo() {
+ Map<HddsProtos.LifeCycleState, MetricsInfo> metrics =
+ new EnumMap<>(HddsProtos.LifeCycleState.class);
+ for (HddsProtos.LifeCycleState state : SYNC_STATES) {
+ String stateName = metricStateName(state);
+ metrics.put(state, Interns.info(
+ CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_CAMEL, stateName)
+ + "ContainerSyncDurationMs",
+ "Time taken by the " + stateName
+ + " container sync pass in milliseconds"));
+ }
+ return Collections.unmodifiableMap(metrics);
+ }
+
+ private static Map<HddsProtos.LifeCycleState, MetricsInfo>
+ initCountDriftMetricInfo() {
+ Map<HddsProtos.LifeCycleState, MetricsInfo> metrics =
+ new EnumMap<>(HddsProtos.LifeCycleState.class);
+ for (HddsProtos.LifeCycleState state : SYNC_STATES) {
+ String stateName = metricStateName(state);
+ metrics.put(state, Interns.info(
+ CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_CAMEL, stateName)
+ + "ContainerCountDrift",
+ "Pre-sync observed " + stateName
Review Comment:
Instead of "Pre-sync observed", please rename this to make it precise,
suggestion below:
```
"Pre-sync observed " + stateName
+ " container count drift, computed as SCM count minus Recon
count"));
to
"Container count drift observed at start of sync pass "
+ "(SCM count minus Recon count for " + stateName + "
state)."));
```
##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerSyncHelper.java:
##########
Review Comment:
In worst case with a very high drift or with a high latency rpc call, this
can be a long running. Please add log message for better troubleshooting.
Probably, can make the progress info log every 50th or 100th batch, to reduce
log noise.
```
int reconciledCount = 0;
int batchNum = 0;
LOG.info("{} sync starting: total={}, batchSize={}, startId={}.",
scmState, total, batchSize, initialStart);
while (true) {
List<ContainerID> batch = scmServiceProvider.getListOfContainerIDs(
startContainerId, batchSize, scmState);
.....
.....
.....
.....
startContainerId = ContainerID.valueOf(nextID);
retrieved += batch.size();
if (batchNum % 50 == 0) {
LOG.info("{} sync progress: batch={}, totalRetrieved={}, added={},
reconciled={}, nextId={}.",
scmState, batchNum, retrieved, addedCount, reconciledCount,
nextID);
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]