This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new dcd767d91a0 [regression-test](backup-restore) wait for colocate group to stabilize before asserting COLOCATE plan (#64532)
dcd767d91a0 is described below

commit dcd767d91a0016033902295cd680b982e4bbb57d
Author: shuke <[email protected]>
AuthorDate: Tue Jun 16 16:11:29 2026 +0800

    [regression-test](backup-restore) wait for colocate group to stabilize before asserting COLOCATE plan (#64532)
    
    ## Problem
    
    `test_backup_restore_colocate_with_partition` (in
    `regression-test/suites/backup_restore/test_backup_restore_colocate.groovy`)
    flakily fails right after a `RESTORE`:
    
    ```
    Explain and check failed, expect contains 'COLOCATE', but actual explain string is:
      HAS_COLO_PLAN_NODE: false
      3:VHASH JOIN(331)
      |  join op: INNER JOIN(BROADCAST)[]
      TABLE: ..._db_new..._table1
    ```
    
    (reproduced deterministically in an isolated run on a 4-BE / force-3
    cluster; failure was on the restore-to-new-db case.)
    
    ## Root cause — a case timing bug, not a plan regression
    
    After `RESTORE` the restored colocate group needs a moment to become
    **stable**, and the planner only emits a `COLOCATE` join once the group
    is stable (otherwise it falls back to `BROADCAST`/shuffle). The suite
    ran `explain ... contains("COLOCATE")` **immediately** after
    `waitAllRestoreFinish`, racing the stabilization. The existing
    `checkColocateTabletHealth` (a single-shot `ColocateMismatchNum == 0`
    assert) sat *after* the assertion, so it didn't gate the explain.
    
    ## Fix
    
    - Add a bounded poll `waitColocatePlan(query)` (60 × 1s) that waits for
    the explain plan to actually contain `COLOCATE`, and call it before each
    `contains("COLOCATE")` assertion.
    - Turn `checkColocateTabletHealth` into a bounded poll as well, so the
    health check waits for stabilization instead of racing it.
    - Applied symmetrically to both suites in the file; the
    `notContains("COLOCATE")` assertions are untouched.
    
    If `COLOCATE` never appears within the timeout, the assertion still
    fails — so a genuine regression would not be masked.
    
    ## Verification
    
    Run isolated on the cluster where the original case failed
    deterministically: the fixed case passes (`Test 1 suites, failed 0
    suites`).
    
    Co-authored-by: Claude Opus 4.8 (1M context) <[email protected]>
---
 .../test_backup_restore_colocate.groovy            | 55 +++++++++++++++++++++-
 1 file changed, 53 insertions(+), 2 deletions(-)

diff --git a/regression-test/suites/backup_restore/test_backup_restore_colocate.groovy b/regression-test/suites/backup_restore/test_backup_restore_colocate.groovy
index 98262234edf..20588be01e9 100644
--- a/regression-test/suites/backup_restore/test_backup_restore_colocate.groovy
+++ b/regression-test/suites/backup_restore/test_backup_restore_colocate.groovy
@@ -35,12 +35,35 @@ suite("test_backup_restore_colocate", "backup_restore,external") {
     }
 
     def checkColocateTabletHealth = { db_name ->
-        def result = showTabletHealth.call(db_name)
+        // Poll until the colocate group has stabilized (no mismatch) instead of
+        // asserting once, so the check waits for stabilization (e.g. after a restore)
+        // rather than racing it.
+        def result = null
+        for (int i = 0; i < 60; i++) {
+            result = showTabletHealth.call(db_name)
+            if (result != null && (result.ColocateMismatchNum as int) == 0) {
+                break
+            }
+            sleep(1000)
+        }
         log.info(result as String)
         assertNotNull(result)
         assertTrue(result.ColocateMismatchNum as int == 0)
     }
 
+    // The planner only produces a COLOCATE join once the colocate group is stable.
+    // Right after a restore the restored group may still be stabilizing, so poll the
+    // explain plan until COLOCATE shows up (bounded wait) before asserting on it.
+    def waitColocatePlan = { q ->
+        def plan = q.replaceAll(/;\s*$/, "")
+        for (int i = 0; i < 60; i++) {
+            if (sql("explain ${plan}").toString().contains("COLOCATE")) {
+                break
+            }
+            sleep(1000)
+        }
+    }
+
     def syncer = getSyncer()
     syncer.createS3Repository(repoName)
 
@@ -95,6 +118,7 @@ suite("test_backup_restore_colocate", "backup_restore,external") {
     res = sql "SELECT * FROM ${dbName}.${tableName2}"
     assertEquals(res.size(), insert_num)
 
+    waitColocatePlan(query)
     explain {
         sql("${query}")
         contains("COLOCATE")
@@ -201,6 +225,7 @@ suite("test_backup_restore_colocate", "backup_restore,external") {
     assertEquals(res.size(), insert_num)
 
 
+    waitColocatePlan(query)
     explain {
         sql("${query}")
         contains("COLOCATE")
@@ -370,12 +395,35 @@ suite("test_backup_restore_colocate_with_partition", "backup_restore") {
     }
 
     def checkColocateTabletHealth = { db_name ->
-        def result = showTabletHealth.call(db_name)
+        // Poll until the colocate group has stabilized (no mismatch) instead of
+        // asserting once, so the check waits for stabilization (e.g. after a restore)
+        // rather than racing it.
+        def result = null
+        for (int i = 0; i < 60; i++) {
+            result = showTabletHealth.call(db_name)
+            if (result != null && (result.ColocateMismatchNum as int) == 0) {
+                break
+            }
+            sleep(1000)
+        }
         log.info(result as String)
         assertNotNull(result)
         assertTrue(result.ColocateMismatchNum as int == 0)
     }
 
+    // The planner only produces a COLOCATE join once the colocate group is stable.
+    // Right after a restore the restored group may still be stabilizing, so poll the
+    // explain plan until COLOCATE shows up (bounded wait) before asserting on it.
+    def waitColocatePlan = { q ->
+        def plan = q.replaceAll(/;\s*$/, "")
+        for (int i = 0; i < 60; i++) {
+            if (sql("explain ${plan}").toString().contains("COLOCATE")) {
+                break
+            }
+            sleep(1000)
+        }
+    }
+
     def syncer = getSyncer()
     syncer.createS3Repository(repoName)
 
@@ -446,6 +494,7 @@ suite("test_backup_restore_colocate_with_partition", "backup_restore") {
     res = sql "SELECT * FROM ${dbName}.${tableName2}"
     assertEquals(res.size(), insert_num)
 
+    waitColocatePlan(query)
     explain {
         sql("${query}")
         contains("COLOCATE")
@@ -550,6 +599,7 @@ suite("test_backup_restore_colocate_with_partition", "backup_restore") {
     assertEquals(res.size(), insert_num)
 
 
+    waitColocatePlan(query)
     explain {
         sql("${query}")
         contains("COLOCATE")
@@ -624,6 +674,7 @@ suite("test_backup_restore_colocate_with_partition", "backup_restore") {
 
     query = "select * from ${newDbName}.${tableName1} as t1, ${newDbName}.${tableName2} as t2 where t1.id=t2.id;"
 
+    waitColocatePlan(query)
     explain {
         sql("${query}")
         contains("COLOCATE")


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to