Copilot commented on code in PR #9721:
URL: https://github.com/apache/ozone/pull/9721#discussion_r2791832422
##########
hadoop-ozone/mini-cluster/src/main/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java:
##########
@@ -141,6 +146,30 @@ public List<OzoneManager> getOzoneManagersList() {
return omhaService.getServices();
}
+ public void
restartOzoneManagersWithConfigCustomizer(Consumer<OzoneConfiguration>
configCustomizer)
+ throws IOException, TimeoutException, InterruptedException {
+ List<OzoneManager> toRestart = new ArrayList<>();
+ for (OzoneManager om : getOzoneManagersList()) {
+ OzoneConfiguration configuration = new
OzoneConfiguration(om.getConfiguration());
+ if (configCustomizer != null) {
+ configCustomizer.accept(configuration);
+ }
+ om.setConfiguration(configuration);
+ if (om.isRunning()) {
+ toRestart.add(om);
+ }
+ }
+ for (OzoneManager om : toRestart) {
+ if (!om.stop()) {
+ continue;
+ }
+ om.join();
+ om.restart();
+ GenericTestUtils.waitFor(om::isRunning, 1000, 30000);
+ }
Review Comment:
`restartOzoneManagersWithConfigCustomizer` silently skips restarting an OM
when `om.stop()` returns false (even though the OM was in the `toRestart`
list). This can mask a failed stop and leave the cluster partially restarted
with a mix of old/new config. Consider failing fast (throw) or at least logging
and still attempting a restart / waiting for a consistent state.
##########
hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestSnapshotBackgroundServices.java:
##########
@@ -463,6 +498,7 @@ public void
testBackupCompactionFilesPruningBackgroundService()
assertNotNull(files);
int numberOfSstFiles = files.length;
+ assertEquals(cluster.getOMLeader(), newLeaderOM);
Review Comment:
`assertEquals(cluster.getOMLeader(), newLeaderOM)` can be flaky because
`getOMLeader()` explicitly returns null when no leader is ready or when
multiple leaders appear ready. If the intent is to verify leadership, use
`cluster.waitForLeaderOM()` (and compare), or assert
`newLeaderOM.isLeaderReady()`.
```suggestion
assertEquals(cluster.waitForLeaderOM(), newLeaderOM);
```
##########
hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestSnapshotBackgroundServices.java:
##########
@@ -120,66 +126,48 @@ public void init(TestInfo testInfo) throws Exception {
conf.setInt(OMConfigKeys.OZONE_OM_RATIS_LOG_PURGE_GAP, LOG_PURGE_GAP);
conf.setStorageSize(OMConfigKeys.OZONE_OM_RATIS_SEGMENT_SIZE_KEY, 16,
StorageUnit.KB);
conf.setStorageSize(OMConfigKeys.OZONE_OM_RATIS_SEGMENT_PREALLOCATED_SIZE_KEY,
16, StorageUnit.KB);
- if ("testSSTFilteringBackgroundService".equals(testInfo.getDisplayName()))
{
- conf.setTimeDuration(OZONE_SNAPSHOT_SST_FILTERING_SERVICE_INTERVAL, 1,
- TimeUnit.SECONDS);
- }
- if ("testCompactionLogBackgroundService"
- .equals(testInfo.getDisplayName())) {
- conf.setTimeDuration(OZONE_OM_SNAPSHOT_COMPACTION_DAG_MAX_TIME_ALLOWED,
1,
- TimeUnit.MILLISECONDS);
- }
- if ("testBackupCompactionFilesPruningBackgroundService"
- .equals(testInfo.getDisplayName())) {
- conf.setTimeDuration(OZONE_OM_SNAPSHOT_COMPACTION_DAG_MAX_TIME_ALLOWED,
1,
- TimeUnit.MILLISECONDS);
- conf.setTimeDuration(
- OZONE_OM_SNAPSHOT_COMPACTION_DAG_PRUNE_DAEMON_RUN_INTERVAL, 1,
- TimeUnit.SECONDS);
- }
- if ("testSnapshotAndKeyDeletionBackgroundServices"
- .equals(testInfo.getDisplayName())) {
- conf.setTimeDuration(OZONE_BLOCK_DELETING_SERVICE_INTERVAL, 1,
- TimeUnit.SECONDS);
- conf.setTimeDuration(OZONE_SNAPSHOT_DELETING_SERVICE_INTERVAL, 1,
- TimeUnit.SECONDS);
- conf.setTimeDuration(OZONE_OM_SNAPSHOT_COMPACTION_DAG_MAX_TIME_ALLOWED,
1,
- TimeUnit.MILLISECONDS);
- conf.setTimeDuration(
- OZONE_OM_SNAPSHOT_COMPACTION_DAG_PRUNE_DAEMON_RUN_INTERVAL, 3,
- TimeUnit.SECONDS);
- conf.setTimeDuration(OZONE_SNAPSHOT_SST_FILTERING_SERVICE_INTERVAL, 3,
- TimeUnit.SECONDS);
- }
+
+ // Used by: testSSTFilteringBackgroundService
+ conf.setTimeDuration(OZONE_SNAPSHOT_SST_FILTERING_SERVICE_INTERVAL, 1,
TimeUnit.SECONDS);
+
+ // Used by: testCompactionLogBackgroundService,
testBackupCompactionFilesPruningBackgroundService,
+ //
testSnapshotAndKeyDeletionBackgroundServices
+ conf.setTimeDuration(OZONE_OM_SNAPSHOT_COMPACTION_DAG_MAX_TIME_ALLOWED, 1,
TimeUnit.MILLISECONDS);
+
+ // Used by: testCompactionLogBackgroundService,
testBackupCompactionFilesPruningBackgroundService
+
conf.setTimeDuration(OZONE_OM_SNAPSHOT_COMPACTION_DAG_PRUNE_DAEMON_RUN_INTERVAL,
3, TimeUnit.SECONDS);
+
+ // Used by: testSnapshotAndKeyDeletionBackgroundServices
+ conf.setTimeDuration(OZONE_BLOCK_DELETING_SERVICE_INTERVAL, 1,
TimeUnit.SECONDS);
+ // Used by: testSnapshotAndKeyDeletionBackgroundServices
+ conf.setTimeDuration(OZONE_SNAPSHOT_DELETING_SERVICE_INTERVAL, 1,
TimeUnit.SECONDS);
+
conf.setLong(
OMConfigKeys.OZONE_OM_RATIS_SNAPSHOT_AUTO_TRIGGER_THRESHOLD_KEY,
SNAPSHOT_THRESHOLD);
int numOfOMs = 3;
cluster = MiniOzoneCluster.newHABuilder(conf)
- .setOMServiceId("om-service-test1")
+ .setOMServiceId(omServiceId)
.setNumOfOzoneManagers(numOfOMs)
- .setNumOfActiveOMs(2)
+ .setNumOfActiveOMs(numOfOMs)
.build();
- if ("testBackupCompactionFilesPruningBackgroundService"
- .equals(testInfo.getDisplayName())) {
- cluster.
- getOzoneManagersList()
- .forEach(
- TestSnapshotBackgroundServices
- ::suspendBackupCompactionFilesPruning);
- }
+
cluster.waitForClusterToBeReady();
client = OzoneClientFactory.getRpcClient(omServiceId, conf);
objectStore = client.getObjectStore();
+ }
+
+ @BeforeEach
+ public void setupTest() throws IOException, InterruptedException,
TimeoutException {
+ recoverCluster();
+ stopFollowerOM(cluster.getOMLeader());
Review Comment:
`setupTest` passes `cluster.getOMLeader()` into `stopFollowerOM(...)`, but
`getOMLeader()` can legally return null transiently (eg during leader
transitions) and would cause the helper to stop an arbitrary OM. Prefer using
the result of `cluster.waitForLeaderOM()` (or assert non-null) to avoid test
flakiness.
```suggestion
OzoneManager leader = cluster.waitForLeaderOM();
stopFollowerOM(leader);
```
##########
hadoop-ozone/mini-cluster/src/main/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java:
##########
@@ -141,6 +146,30 @@ public List<OzoneManager> getOzoneManagersList() {
return omhaService.getServices();
}
+ public void
restartOzoneManagersWithConfigCustomizer(Consumer<OzoneConfiguration>
configCustomizer)
+ throws IOException, TimeoutException, InterruptedException {
+ List<OzoneManager> toRestart = new ArrayList<>();
+ for (OzoneManager om : getOzoneManagersList()) {
+ OzoneConfiguration configuration = new
OzoneConfiguration(om.getConfiguration());
+ if (configCustomizer != null) {
+ configCustomizer.accept(configuration);
+ }
+ om.setConfiguration(configuration);
+ if (om.isRunning()) {
Review Comment:
Access of [element](1) annotated with VisibleForTesting found in production
code.
##########
hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestSnapshotBackgroundServices.java:
##########
@@ -357,24 +370,39 @@ private void createSnapshotsEachWithNewKeys(OzoneManager
ozoneManager)
}
}
- private OzoneManager getInactiveFollowerOM(OzoneManager leaderOM) {
- String followerNodeId = leaderOM.getPeerNodes().get(0).getNodeId();
- if (cluster.isOMActive(followerNodeId)) {
- followerNodeId = leaderOM.getPeerNodes().get(1).getNodeId();
- }
- return cluster.getOzoneManager(followerNodeId);
+ private OzoneManager getInactiveFollowerOM(OzoneManager leaderOM)
+ throws TimeoutException, InterruptedException {
+ // Wait for an inactive OM to be available
+ AtomicReference<OzoneManager> inactiveOM = new AtomicReference<>();
+ GenericTestUtils.waitFor(() -> {
+ Iterator<OzoneManager> iterator = cluster.getInactiveOM();
+ if (iterator.hasNext()) {
+ inactiveOM.set(iterator.next());
+ return true;
Review Comment:
`getInactiveFollowerOM` no longer uses the `leaderOM` parameter. This makes
the signature misleading and may trigger unused-parameter warnings in some
builds/checkstyle configs. Consider removing the parameter (and updating
callers) or using it to ensure you don't accidentally pick the leader as the
"inactive" OM.
```suggestion
// Wait for an inactive OM (that is not the leader) to be available
AtomicReference<OzoneManager> inactiveOM = new AtomicReference<>();
GenericTestUtils.waitFor(() -> {
Iterator<OzoneManager> iterator = cluster.getInactiveOM();
while (iterator.hasNext()) {
OzoneManager candidate = iterator.next();
// Ensure we don't accidentally pick the leader as the "inactive" OM.
if (!Objects.equals(candidate, leaderOM)) {
inactiveOM.set(candidate);
return true;
}
```
##########
hadoop-ozone/mini-cluster/src/main/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java:
##########
@@ -141,6 +146,30 @@ public List<OzoneManager> getOzoneManagersList() {
return omhaService.getServices();
}
+ public void
restartOzoneManagersWithConfigCustomizer(Consumer<OzoneConfiguration>
configCustomizer)
+ throws IOException, TimeoutException, InterruptedException {
+ List<OzoneManager> toRestart = new ArrayList<>();
+ for (OzoneManager om : getOzoneManagersList()) {
+ OzoneConfiguration configuration = new
OzoneConfiguration(om.getConfiguration());
+ if (configCustomizer != null) {
+ configCustomizer.accept(configuration);
+ }
+ om.setConfiguration(configuration);
Review Comment:
Access of [element](1) annotated with VisibleForTesting found in production
code.
##########
hadoop-ozone/mini-cluster/src/main/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java:
##########
@@ -141,6 +146,30 @@ public List<OzoneManager> getOzoneManagersList() {
return omhaService.getServices();
}
+ public void
restartOzoneManagersWithConfigCustomizer(Consumer<OzoneConfiguration>
configCustomizer)
+ throws IOException, TimeoutException, InterruptedException {
+ List<OzoneManager> toRestart = new ArrayList<>();
+ for (OzoneManager om : getOzoneManagersList()) {
+ OzoneConfiguration configuration = new
OzoneConfiguration(om.getConfiguration());
+ if (configCustomizer != null) {
+ configCustomizer.accept(configuration);
+ }
+ om.setConfiguration(configuration);
+ if (om.isRunning()) {
+ toRestart.add(om);
+ }
+ }
+ for (OzoneManager om : toRestart) {
+ if (!om.stop()) {
+ continue;
+ }
+ om.join();
+ om.restart();
+ GenericTestUtils.waitFor(om::isRunning, 1000, 30000);
Review Comment:
Access of [element](1) annotated with VisibleForTesting found in production
code.
```suggestion
long startTime = System.currentTimeMillis();
while (!om.isRunning()) {
if (System.currentTimeMillis() - startTime > 30000) {
throw new TimeoutException("Timed out waiting for OzoneManager to
start.");
}
Thread.sleep(1000);
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]