J-HowHuang commented on code in PR #15618:
URL: https://github.com/apache/pinot/pull/15618#discussion_r2072234211


##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/TableRebalancer.java:
##########
@@ -1303,47 +1303,75 @@ private IdealState waitForExternalViewToConverge(String 
tableNameWithType, boole
     long endTimeMs = System.currentTimeMillis() + 
externalViewStabilizationTimeoutInMs;
 
     IdealState idealState;
-    do {
-      tableRebalanceLogger.debug("Start to check if ExternalView converges to 
IdealStates");
-      idealState = 
_helixDataAccessor.getProperty(_helixDataAccessor.keyBuilder().idealStates(tableNameWithType));
-      // IdealState might be null if table got deleted, throwing exception to 
abort the rebalance
-      Preconditions.checkState(idealState != null, "Failed to find the 
IdealState");
-
-      ExternalView externalView =
-          
_helixDataAccessor.getProperty(_helixDataAccessor.keyBuilder().externalView(tableNameWithType));
-      // ExternalView might be null when table is just created, skipping check 
for this iteration
-      if (externalView != null) {
-        // Record external view and ideal state convergence status
-        TableRebalanceObserver.RebalanceContext rebalanceContext = new 
TableRebalanceObserver.RebalanceContext(
-            estimateAverageSegmentSizeInBytes, allSegmentsFromIdealState, 
segmentsToMonitor);
-        _tableRebalanceObserver.onTrigger(
-            
TableRebalanceObserver.Trigger.EXTERNAL_VIEW_TO_IDEAL_STATE_CONVERGENCE_TRIGGER,
-            externalView.getRecord().getMapFields(), 
idealState.getRecord().getMapFields(), rebalanceContext);
-        // Update unique segment list as IS-EV trigger must have processed 
these
-        allSegmentsFromIdealState = 
idealState.getRecord().getMapFields().keySet();
-        if (_tableRebalanceObserver.isStopped()) {
-          throw new RuntimeException(
-              String.format("Rebalance has already stopped with status: %s", 
_tableRebalanceObserver.getStopStatus()));
+    ExternalView externalView;
+    int previousRemainingSegments = -1;
+    while (true) {
+      do {
+        tableRebalanceLogger.debug("Start to check if ExternalView converges 
to IdealStates");
+        idealState = 
_helixDataAccessor.getProperty(_helixDataAccessor.keyBuilder().idealStates(tableNameWithType));
+        // IdealState might be null if table got deleted, throwing exception 
to abort the rebalance
+        Preconditions.checkState(idealState != null, "Failed to find the 
IdealState");
+
+        externalView = 
_helixDataAccessor.getProperty(_helixDataAccessor.keyBuilder().externalView(tableNameWithType));
+        // ExternalView might be null when table is just created, skipping 
check for this iteration
+        if (externalView != null) {
+          // Record external view and ideal state convergence status
+          TableRebalanceObserver.RebalanceContext rebalanceContext = new 
TableRebalanceObserver.RebalanceContext(
+              estimateAverageSegmentSizeInBytes, allSegmentsFromIdealState, 
segmentsToMonitor);
+          _tableRebalanceObserver.onTrigger(
+              
TableRebalanceObserver.Trigger.EXTERNAL_VIEW_TO_IDEAL_STATE_CONVERGENCE_TRIGGER,
+              externalView.getRecord().getMapFields(), 
idealState.getRecord().getMapFields(), rebalanceContext);
+          // Update unique segment list as IS-EV trigger must have processed 
these
+          allSegmentsFromIdealState = 
idealState.getRecord().getMapFields().keySet();
+          if (_tableRebalanceObserver.isStopped()) {
+            throw new RuntimeException(
+                String.format("Rebalance has already stopped with status: %s",
+                    _tableRebalanceObserver.getStopStatus()));
+          }
+          if (previousRemainingSegments < 0) {
+            // initialize previousRemainingSegments
+            previousRemainingSegments = 
getNumRemainingSegmentsToProcess(tableNameWithType,
+                externalView.getRecord().getMapFields(), 
idealState.getRecord().getMapFields(), lowDiskMode,
+                bestEfforts, segmentsToMonitor, tableRebalanceLogger, false);
+            if (previousRemainingSegments == 0) {
+              tableRebalanceLogger.info("ExternalView converged");
+              return idealState;
+            }
+          } else if (isExternalViewConverged(tableNameWithType, 
externalView.getRecord().getMapFields(),
+              idealState.getRecord().getMapFields(), lowDiskMode, bestEfforts, 
segmentsToMonitor,
+              tableRebalanceLogger)) {
+            tableRebalanceLogger.info("ExternalView converged");
+            return idealState;
+          }
         }
-        if (isExternalViewConverged(tableNameWithType, 
externalView.getRecord().getMapFields(),
-            idealState.getRecord().getMapFields(), lowDiskMode, bestEfforts, 
segmentsToMonitor, tableRebalanceLogger)) {
-          tableRebalanceLogger.info("ExternalView converged");
-          return idealState;
+        tableRebalanceLogger.debug("ExternalView has not converged to 
IdealStates. Retry after: {}ms",
+            externalViewCheckIntervalInMs);
+        Thread.sleep(externalViewCheckIntervalInMs);
+      } while (System.currentTimeMillis() < endTimeMs);
+      if (bestEfforts) {
+        tableRebalanceLogger.warn(
+            "ExternalView has not converged within: {}ms, continuing the 
rebalance (best-efforts)",
+            externalViewStabilizationTimeoutInMs);
+        return idealState;
+      }
+      if (externalView != null) {
+        int currentRemainingSegments = 
getNumRemainingSegmentsToProcess(tableNameWithType,
+            externalView.getRecord().getMapFields(), 
idealState.getRecord().getMapFields(), lowDiskMode, bestEfforts,
+            segmentsToMonitor, tableRebalanceLogger, false);
+        if (currentRemainingSegments < previousRemainingSegments) {
+          tableRebalanceLogger.info(
+              "Extending EV stabilization timeout for another {}ms, remaining 
{} segments to be processed.",
+              externalViewStabilizationTimeoutInMs, currentRemainingSegments);

Review Comment:
   Added. Example log output (see the PR description for the full log):
   ```
   2025/05/02 16:13:36.141 INFO 
[TableRebalancer-jhow_OFFLINE-b646148f-18de-4bdd-a79d-d5cfa6a10359] 
[jersey-server-managed-async-executor-0] Extending EV stabilization timeout for 
another 15000ms, remaining 920 segments to be processed.
   2025/05/02 16:13:51.338 INFO 
[TableRebalancer-jhow_OFFLINE-b646148f-18de-4bdd-a79d-d5cfa6a10359] 
[jersey-server-managed-async-executor-0] Extending EV stabilization timeout for 
another 15000ms, remaining 801 segments to be processed.
   2025/05/02 16:14:06.531 INFO 
[TableRebalancer-jhow_OFFLINE-b646148f-18de-4bdd-a79d-d5cfa6a10359] 
[jersey-server-managed-async-executor-0] Extending EV stabilization timeout for 
another 15000ms, remaining 698 segments to be processed.
   2025/05/02 16:14:21.749 INFO 
[TableRebalancer-jhow_OFFLINE-b646148f-18de-4bdd-a79d-d5cfa6a10359] 
[jersey-server-managed-async-executor-0] Extending EV stabilization timeout for 
another 15000ms, remaining 594 segments to be processed.
   2025/05/02 16:14:36.956 INFO 
[TableRebalancer-jhow_OFFLINE-b646148f-18de-4bdd-a79d-d5cfa6a10359] 
[jersey-server-managed-async-executor-0] Extending EV stabilization timeout for 
another 15000ms, remaining 480 segments to be processed.
   2025/05/02 16:14:52.157 INFO 
[TableRebalancer-jhow_OFFLINE-b646148f-18de-4bdd-a79d-d5cfa6a10359] 
[jersey-server-managed-async-executor-0] Extending EV stabilization timeout for 
another 15000ms, remaining 364 segments to be processed.
   2025/05/02 16:15:07.389 INFO 
[TableRebalancer-jhow_OFFLINE-b646148f-18de-4bdd-a79d-d5cfa6a10359] 
[jersey-server-managed-async-executor-0] Extending EV stabilization timeout for 
another 15000ms, remaining 261 segments to be processed.
   2025/05/02 16:15:22.616 INFO 
[TableRebalancer-jhow_OFFLINE-b646148f-18de-4bdd-a79d-d5cfa6a10359] 
[jersey-server-managed-async-executor-0] Extending EV stabilization timeout for 
another 15000ms, remaining 152 segments to be processed.
   2025/05/02 16:15:37.812 INFO 
[TableRebalancer-jhow_OFFLINE-b646148f-18de-4bdd-a79d-d5cfa6a10359] 
[jersey-server-managed-async-executor-0] Extending EV stabilization timeout for 
another 15000ms, remaining 39 segments to be processed.
   2025/05/02 16:15:42.883 INFO 
[TableRebalancer-jhow_OFFLINE-b646148f-18de-4bdd-a79d-d5cfa6a10359] 
[jersey-server-managed-async-executor-0] ExternalView converged in 141937ms, 
with 9 extensions
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to