This is an automated email from the ASF dual-hosted git repository.

morrySnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d77856dc715 [improvement](statistics) Collect hot value during full 
statistics analysis (#62435)
d77856dc715 is described below

commit d77856dc71585da845d3646cf9c20d2fd94c0c48
Author: yujun <[email protected]>
AuthorDate: Mon May 18 18:10:09 2026 +0800

    [improvement](statistics) Collect hot value during full statistics analysis 
(#62435)
    
    ### What problem does this PR solve?
    
    Problem Summary: Previously, SQL-based full statistics collection
    (`ANALYZE TABLE ... WITH SYNC`)
    hardcoded `null as hot_value` in `FULL_ANALYZE_TEMPLATE`, meaning hot
    values (frequent
    values) were only collected during sample-based analysis. This made
    SQL-based full analysis
    produce less complete statistics than sample analysis.
    
    This change rewrites `FULL_ANALYZE_TEMPLATE` to use a CTE-based
    structure (matching
    `LINEAR_ANALYZE_TEMPLATE` and `DUJ1_ANALYZE_TEMPLATE` patterns) that
    computes hot values
    via GROUP BY + TOP-N aggregation. Both `OlapAnalysisTask` and
    `ExternalAnalysisTask` are
    updated to pass the required parameters (`hotValueCollectCount`,
    `subStringColName`,
    `rowCount2`) to the template.
    
    Scope note: this PR intentionally targets non-partition OLAP full
    analyze and SQL-based
    external full analyze. Partition analyze keeps using the existing
    partition-level stats
    and merge path, and default HMS metadata analyze keeps using
    HMS-provided metadata without
    scanning table rows; those paths do not derive or publish hot values in
    this PR.
    
    ### Release note
    
    SQL-based full statistics collection now also collects hot value
    (frequent value)
    information, matching the behavior of sample-based collection for the
    covered paths.
    
    ---------
    
    Co-authored-by: Copilot <[email protected]>
---
 .../apache/doris/statistics/BaseAnalysisTask.java  |  25 +++-
 .../doris/statistics/ExternalAnalysisTask.java     |   6 +-
 .../apache/doris/statistics/OlapAnalysisTask.java  |   6 +-
 .../doris/statistics/HMSAnalysisTaskTest.java      |  72 +++++----
 .../doris/statistics/OlapAnalysisTaskTest.java     | 122 ++++++++++++++-
 .../distinct_split/disitinct_split.out             |  57 ++++---
 .../suites/mv_p0/ssb/q_4_1_r1/q_4_1_r1.groovy      |  14 +-
 .../statistics/test_full_analyze_hot_value.groovy  | 166 +++++++++++++++++++++
 .../suites/statistics/test_hot_value.groovy        |  20 ++-
 9 files changed, 415 insertions(+), 73 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index f382cb8350c..852ec0e6fde 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -73,8 +73,11 @@ public abstract class BaseAnalysisTask {
      */
     public static final String ANALYZE_SKIP_LONG_STRING_COLUMN_MARKER = 
"ANALYZE_SKIP_LONG_STRING_COLUMN";
 
-    protected static final String FULL_ANALYZE_TEMPLATE =
-            "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
+    protected static final String FULL_ANALYZE_TEMPLATE = "WITH cte1 AS ("
+            +     "SELECT `${colName}`${lengthAssert} "
+            +     "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}), "
+            + "cte2 AS ("
+            +     "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS 
`id`, "
             +     "${catalogId} AS `catalog_id`, "
             +     "${dbId} AS `db_id`, "
             +     "${tblId} AS `tbl_id`, "
@@ -87,10 +90,20 @@ public abstract class BaseAnalysisTask {
             +     "SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) AS 
`min`, "
             +     "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) AS 
`max`, "
             +     "${dataSizeFunction} AS `data_size`, "
-            +     "NOW() AS `update_time`, "
-            +     "null as `hot_value` "
-            + "FROM (SELECT `${colName}`${lengthAssert} "
-            +     "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}) 
__lc_t";
+            +     "NOW() "
+            + "FROM cte1), "
+            + "cte3 AS ("
+            +     "SELECT IFNULL(GROUP_CONCAT(CONCAT("
+            +         "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"), 
\";\", \"\\\\;\"), "
+            +         "\" :\", ROUND(t.`count` / ${rowCount2}, 2)), \" ;\"), 
'') "
+            +         "as `hot_value` "
+            +     "FROM ("
+            +         "SELECT ${subStringColName} as `hash_value`, "
+            +         "MAX(`${colName}`) as `column_key`, "
+            +         "COUNT(1) AS `count` "
+            +         "FROM cte1 WHERE `${colName}` IS NOT NULL "
+            +         "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT 
${hotValueCollectCount}) t) "
+            + "SELECT * FROM cte2 CROSS JOIN cte3";
 
     protected static final String LINEAR_ANALYZE_TEMPLATE = "WITH cte1 AS ("
             +     "SELECT `${colName}`${lengthAssert} "
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
index b9a2525ac6f..8c4eccb1165 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
@@ -21,6 +21,7 @@ import org.apache.doris.common.DdlException;
 import org.apache.doris.common.FeConstants;
 import org.apache.doris.common.NotImplementedException;
 import org.apache.doris.datasource.ExternalTable;
+import org.apache.doris.qe.SessionVariable;
 
 import org.apache.commons.text.StringSubstitutor;
 
@@ -59,9 +60,10 @@ public class ExternalAnalysisTask extends BaseAnalysisTask {
     protected void doFull() throws Exception {
         StringBuilder sb = new StringBuilder();
         Map<String, String> params = buildSqlParams();
-        params.put("min", getMinFunction());
-        params.put("max", getMaxFunction());
         params.put("dataSizeFunction", getDataSizeFunction(col, false));
+        params.put("hotValueCollectCount", 
String.valueOf(SessionVariable.getHotValueCollectCount()));
+        params.put("subStringColName", getStringTypeColName(col));
+        params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `${colName}` 
IS NOT NULL)");
         if (LOG.isDebugEnabled()) {
             LOG.debug("Will do full collection for column {}", col.getName());
         }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index bd0c5d94f12..4312b31208f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -329,7 +329,11 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
         if (StatisticsUtil.enablePartitionAnalyze() && 
tbl.isPartitionedTable()) {
             doPartitionTable();
         } else {
-            StringSubstitutor stringSubstitutor = new 
StringSubstitutor(buildSqlParams());
+            Map<String, String> params = buildSqlParams();
+            params.put("hotValueCollectCount", 
String.valueOf(SessionVariable.getHotValueCollectCount()));
+            params.put("subStringColName", getStringTypeColName(col));
+            params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE 
`${colName}` IS NOT NULL)");
+            StringSubstitutor stringSubstitutor = new 
StringSubstitutor(params);
             runQuery(stringSubstitutor.replace(FULL_ANALYZE_TEMPLATE));
         }
     }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
index faecabe7469..0df615319f1 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
@@ -24,6 +24,7 @@ import org.apache.doris.catalog.PrimitiveType;
 import org.apache.doris.common.Pair;
 import org.apache.doris.datasource.CatalogIf;
 import org.apache.doris.datasource.hive.HMSExternalTable;
+import org.apache.doris.qe.SessionVariable;
 import org.apache.doris.statistics.util.StatisticsUtil;
 
 import com.google.common.collect.ImmutableList;
@@ -31,6 +32,7 @@ import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
+import org.mockito.MockedStatic;
 import org.mockito.Mockito;
 
 import java.util.ArrayList;
@@ -156,34 +158,48 @@ public class HMSAnalysisTaskTest {
         Mockito.when(databaseIf.getFullName()).thenReturn("default");
         
Mockito.when(tableIf.getPartitionNames()).thenReturn(ImmutableSet.of("date=20230101/hour=12"));
 
-        HMSAnalysisTask task = Mockito.spy(new HMSAnalysisTask());
-        Mockito.doAnswer(invocation -> {
-            String sql = invocation.getArgument(0);
-            Assertions.assertEquals("SELECT CONCAT(30001, '-', -1, '-', 
'hour') AS `id`, "
-                    + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS 
`tbl_id`, "
-                    + "-1 AS `idx_id`, 'hour' AS `col_id`, NULL AS `part_id`, "
-                    + "COUNT(1) AS `row_count`, NDV(`hour`) AS `ndv`, "
-                    + "COUNT(1) - COUNT(`hour`) AS `null_count`, "
-                    + "SUBSTRING(CAST(MIN(`hour`) AS STRING), 1, 1024) AS 
`min`, "
-                    + "SUBSTRING(CAST(MAX(`hour`) AS STRING), 1, 1024) AS 
`max`, "
-                    + "COUNT(1) * 4 AS `data_size`, NOW() AS `update_time`, "
-                    + "null as `hot_value` FROM (SELECT `hour` FROM 
`hms`.`default`.`test` ) __lc_t", sql);
-            return null;
-        }).when(task).runQuery(Mockito.anyString());
-
-        task.col = new Column("hour", PrimitiveType.INT);
-        task.tbl = tableIf;
-        task.catalog = catalogIf;
-        task.db = databaseIf;
-        task.setTable(tableIf);
-
-        AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder();
-        analysisInfoBuilder.setColName("hour");
-        analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
-        analysisInfoBuilder.setUsingSqlForExternalTable(true);
-        task.info = analysisInfoBuilder.build();
-
-        task.doExecute();
+        try (MockedStatic<SessionVariable> mockedSessionVariable = 
Mockito.mockStatic(SessionVariable.class)) {
+            
mockedSessionVariable.when(SessionVariable::getHotValueCollectCount).thenReturn(10);
+
+            HMSAnalysisTask task = Mockito.spy(new HMSAnalysisTask());
+            Mockito.doAnswer(invocation -> {
+                String sql = invocation.getArgument(0);
+                Assertions.assertEquals("WITH cte1 AS (SELECT `hour` "
+                        + "FROM `hms`.`default`.`test` ), "
+                        + "cte2 AS (SELECT CONCAT(30001, '-', -1, '-', 'hour') 
AS `id`, "
+                        + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS 
`tbl_id`, "
+                        + "-1 AS `idx_id`, 'hour' AS `col_id`, NULL AS 
`part_id`, "
+                        + "COUNT(1) AS `row_count`, NDV(`hour`) AS `ndv`, "
+                        + "COUNT(1) - COUNT(`hour`) AS `null_count`, "
+                        + "SUBSTRING(CAST(MIN(`hour`) AS STRING), 1, 1024) AS 
`min`, "
+                        + "SUBSTRING(CAST(MAX(`hour`) AS STRING), 1, 1024) AS 
`max`, "
+                        + "COUNT(1) * 4 AS `data_size`, NOW() FROM cte1), "
+                        + "cte3 AS (SELECT IFNULL(GROUP_CONCAT(CONCAT("
+                        + "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"), 
\";\", \"\\\\;\"), "
+                        + "\" :\", ROUND(t.`count` / "
+                        + "(SELECT COUNT(1) FROM cte1 WHERE `hour` IS NOT 
NULL), 2)), \" ;\"), '') "
+                        + "as `hot_value` FROM (SELECT `hour` as `hash_value`, 
"
+                        + "MAX(`hour`) as `column_key`, COUNT(1) AS `count` "
+                        + "FROM cte1 WHERE `hour` IS NOT NULL "
+                        + "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT 
10) t) "
+                        + "SELECT * FROM cte2 CROSS JOIN cte3", sql);
+                return null;
+            }).when(task).runQuery(Mockito.anyString());
+
+            task.col = new Column("hour", PrimitiveType.INT);
+            task.tbl = tableIf;
+            task.catalog = catalogIf;
+            task.db = databaseIf;
+            task.setTable(tableIf);
+
+            AnalysisInfoBuilder analysisInfoBuilder = new 
AnalysisInfoBuilder();
+            analysisInfoBuilder.setColName("hour");
+            analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
+            analysisInfoBuilder.setUsingSqlForExternalTable(true);
+            task.info = analysisInfoBuilder.build();
+
+            task.doExecute();
+        }
     }
 
     @SuppressWarnings("unchecked")
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index 27515f16670..2c93ad7dd9a 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -39,6 +39,7 @@ import org.apache.doris.common.AnalysisException;
 import org.apache.doris.common.FeConstants;
 import org.apache.doris.common.Pair;
 import org.apache.doris.datasource.CatalogIf;
+import org.apache.doris.qe.SessionVariable;
 import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod;
 import org.apache.doris.statistics.AnalysisInfo.JobType;
 import org.apache.doris.statistics.util.StatisticsUtil;
@@ -698,9 +699,9 @@ public class OlapAnalysisTaskTest {
                         + 
BaseAnalysisTask.ANALYZE_SKIP_LONG_STRING_COLUMN_MARKER + "') AS `__lc`");
         StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
         String sql = 
stringSubstitutor.replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE);
-        Assertions.assertTrue(sql.contains("FROM (SELECT `s`, assert_true("), 
sql);
+        Assertions.assertTrue(sql.contains("WITH cte1 AS (SELECT `s`, 
assert_true("), sql);
         Assertions.assertTrue(sql.contains("IS NULL OR LENGTH(`s`) <= 1024"), 
sql);
-        Assertions.assertTrue(sql.endsWith(") __lc_t"), sql);
+        Assertions.assertTrue(sql.contains("FROM cte1), cte3 AS ("), sql);
     }
 
     @Test
@@ -724,7 +725,120 @@ public class OlapAnalysisTaskTest {
         StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
         String sql = 
stringSubstitutor.replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE);
         Assertions.assertFalse(sql.contains("assert_true"), sql);
-        Assertions.assertTrue(sql.contains("FROM (SELECT `id` FROM 
`internal`.`db1`.`tbl1`"), sql);
-        Assertions.assertTrue(sql.endsWith(") __lc_t"), sql);
+        Assertions.assertTrue(sql.contains("WITH cte1 AS (SELECT `id` FROM 
`internal`.`db1`.`tbl1`"), sql);
+        Assertions.assertTrue(sql.contains("FROM cte1), cte3 AS ("), sql);
+
+    }
+
+    @Test
+    public void testFullAnalyzeTemplateSql() {
+        Map<String, String> params = new HashMap<>();
+        params.put("catalogId", "0");
+        params.put("dbId", "1");
+        params.put("tblId", "2");
+        params.put("idxId", "3");
+        params.put("colId", "col1");
+        params.put("colName", "col1");
+        params.put("dataSizeFunction", "SUM(LENGTH(`col1`))");
+        params.put("catalogName", "internal");
+        params.put("dbName", "db1");
+        params.put("tblName", "tbl1");
+        params.put("index", "");
+        params.put("hotValueCollectCount", "10");
+        params.put("subStringColName", "`col1`");
+        params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `col1` IS 
NOT NULL)");
+        StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
+        String sql = 
stringSubstitutor.replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE);
+        Assertions.assertTrue(sql.startsWith("WITH cte1 AS ("));
+        Assertions.assertTrue(sql.contains("cte3 AS (SELECT 
IFNULL(GROUP_CONCAT"));
+        Assertions.assertTrue(sql.contains("as `hot_value`"));
+        Assertions.assertTrue(sql.contains("CROSS JOIN cte3"));
+        Assertions.assertTrue(sql.contains("LIMIT 10"));
+        Assertions.assertTrue(sql.contains("GROUP BY `hash_value` ORDER BY 
`count` DESC"));
+        Assertions.assertFalse(sql.contains("null as `hot_value`"));
+    }
+
+    @Test
+    public void testDoFullHotValue() throws Exception {
+        CatalogIf catalogIf = Mockito.mock(CatalogIf.class);
+        DatabaseIf databaseIf = Mockito.mock(DatabaseIf.class);
+        OlapTable tableIf = Mockito.mock(OlapTable.class);
+        Mockito.when(tableIf.getId()).thenReturn(30001L);
+        Mockito.when(tableIf.getName()).thenReturn("testTbl");
+        Mockito.when(catalogIf.getId()).thenReturn(10001L);
+        Mockito.when(catalogIf.getName()).thenReturn("catalogName");
+        Mockito.when(databaseIf.getId()).thenReturn(20001L);
+        Mockito.when(databaseIf.getFullName()).thenReturn("testDb");
+
+        try (MockedStatic<StatisticsUtil> mockedStatisticsUtil = 
Mockito.mockStatic(
+                StatisticsUtil.class, Mockito.CALLS_REAL_METHODS);
+                MockedStatic<SessionVariable> mockedSessionVariable = 
Mockito.mockStatic(SessionVariable.class)) {
+            
mockedStatisticsUtil.when(StatisticsUtil::enablePartitionAnalyze).thenReturn(false);
+            
mockedSessionVariable.when(SessionVariable::getHotValueCollectCount).thenReturn(10);
+
+            OlapAnalysisTask task = Mockito.spy(new OlapAnalysisTask());
+            Mockito.doAnswer(invocation -> {
+                String sql = invocation.getArgument(0);
+                Assertions.assertTrue(sql.startsWith("WITH cte1 AS (SELECT 
`testCol` "
+                        + "FROM `catalogName`.`testDb`.`testTbl` "), sql);
+                Assertions.assertTrue(sql.contains("cte3 AS (SELECT 
IFNULL(GROUP_CONCAT"), sql);
+                Assertions.assertTrue(sql.contains("`testCol` as 
`hash_value`"), sql);
+                Assertions.assertTrue(sql.contains("LIMIT 10"), sql);
+                Assertions.assertTrue(sql.contains("CROSS JOIN cte3"), sql);
+                Assertions.assertFalse(sql.contains("null as `hot_value`"), 
sql);
+                return null;
+            }).when(task).runQuery(Mockito.anyString());
+
+            task.col = new Column("testCol", 
Type.fromPrimitiveType(PrimitiveType.INT),
+                true, null, null, null);
+            task.tbl = tableIf;
+            AnalysisInfoBuilder builder = new AnalysisInfoBuilder();
+            builder.setJobType(AnalysisInfo.JobType.MANUAL);
+            builder.setColName("testCol");
+            task.info = builder.build();
+            task.catalog = catalogIf;
+            task.db = databaseIf;
+            task.doFull();
+        }
+    }
+
+    @Test
+    public void testDoFullHotValueStringColumn() throws Exception {
+        CatalogIf catalogIf = Mockito.mock(CatalogIf.class);
+        DatabaseIf databaseIf = Mockito.mock(DatabaseIf.class);
+        OlapTable tableIf = Mockito.mock(OlapTable.class);
+        Mockito.when(tableIf.getId()).thenReturn(30001L);
+        Mockito.when(tableIf.getName()).thenReturn("testTbl");
+        Mockito.when(catalogIf.getId()).thenReturn(10001L);
+        Mockito.when(catalogIf.getName()).thenReturn("catalogName");
+        Mockito.when(databaseIf.getId()).thenReturn(20001L);
+        Mockito.when(databaseIf.getFullName()).thenReturn("testDb");
+
+        try (MockedStatic<StatisticsUtil> mockedStatisticsUtil = 
Mockito.mockStatic(
+                StatisticsUtil.class, Mockito.CALLS_REAL_METHODS);
+                MockedStatic<SessionVariable> mockedSessionVariable = 
Mockito.mockStatic(SessionVariable.class)) {
+            
mockedStatisticsUtil.when(StatisticsUtil::enablePartitionAnalyze).thenReturn(false);
+            
mockedSessionVariable.when(SessionVariable::getHotValueCollectCount).thenReturn(10);
+
+            OlapAnalysisTask task = Mockito.spy(new OlapAnalysisTask());
+            Mockito.doAnswer(invocation -> {
+                String sql = invocation.getArgument(0);
+                Assertions.assertTrue(sql.contains(
+                        "xxhash_64(SUBSTRING(CAST(`strCol` AS STRING), 1, 
1024)) as `hash_value`"), sql);
+                Assertions.assertTrue(sql.contains("MAX(`strCol`) as 
`column_key`"), sql);
+                return null;
+            }).when(task).runQuery(Mockito.anyString());
+
+            task.col = new Column("strCol", 
Type.fromPrimitiveType(PrimitiveType.STRING),
+                true, null, null, null);
+            task.tbl = tableIf;
+            AnalysisInfoBuilder builder = new AnalysisInfoBuilder();
+            builder.setJobType(AnalysisInfo.JobType.MANUAL);
+            builder.setColName("strCol");
+            task.info = builder.build();
+            task.catalog = catalogIf;
+            task.db = databaseIf;
+            task.doFull();
+        }
     }
 }
diff --git 
a/regression-test/data/nereids_rules_p0/distinct_split/disitinct_split.out 
b/regression-test/data/nereids_rules_p0/distinct_split/disitinct_split.out
index b239f4d54f8..9a18cab3879 100644
--- a/regression-test/data/nereids_rules_p0/distinct_split/disitinct_split.out
+++ b/regression-test/data/nereids_rules_p0/distinct_split/disitinct_split.out
@@ -369,11 +369,13 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
 ------hashAgg[DISTINCT_GLOBAL]
 --------hashAgg[DISTINCT_LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 ------hashAgg[DISTINCT_GLOBAL]
 --------hashAgg[DISTINCT_LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 
 -- !multi_count_mulitcols_with_gby --
 PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -384,11 +386,13 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
 ------hashAgg[GLOBAL]
 --------hashAgg[LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 ------hashAgg[GLOBAL]
 --------hashAgg[LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 
 -- !three_count_mulitcols_without_gby --
 PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -400,15 +404,18 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
 --------hashAgg[DISTINCT_GLOBAL]
 ----------hashAgg[DISTINCT_LOCAL]
 ------------hashAgg[GLOBAL]
---------------PhysicalCteConsumer ( cteId=CTEId#0 )
+--------------hashAgg[LOCAL]
+----------------PhysicalCteConsumer ( cteId=CTEId#0 )
 --------hashAgg[DISTINCT_GLOBAL]
 ----------hashAgg[DISTINCT_LOCAL]
 ------------hashAgg[GLOBAL]
---------------PhysicalCteConsumer ( cteId=CTEId#0 )
+--------------hashAgg[LOCAL]
+----------------PhysicalCteConsumer ( cteId=CTEId#0 )
 ------hashAgg[DISTINCT_GLOBAL]
 --------hashAgg[DISTINCT_LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 
 -- !four_count_mulitcols_with_gby --
 PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -421,19 +428,23 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
 ----------hashAgg[GLOBAL]
 ------------hashAgg[LOCAL]
 --------------hashAgg[GLOBAL]
-----------------PhysicalCteConsumer ( cteId=CTEId#0 )
+----------------hashAgg[LOCAL]
+------------------PhysicalCteConsumer ( cteId=CTEId#0 )
 ----------hashAgg[GLOBAL]
 ------------hashAgg[LOCAL]
 --------------hashAgg[GLOBAL]
-----------------PhysicalCteConsumer ( cteId=CTEId#0 )
+----------------hashAgg[LOCAL]
+------------------PhysicalCteConsumer ( cteId=CTEId#0 )
 --------hashAgg[GLOBAL]
 ----------hashAgg[LOCAL]
 ------------hashAgg[GLOBAL]
---------------PhysicalCteConsumer ( cteId=CTEId#0 )
+--------------hashAgg[LOCAL]
+----------------PhysicalCteConsumer ( cteId=CTEId#0 )
 ------hashAgg[GLOBAL]
 --------hashAgg[LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 
 -- !has_other_func --
 PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -467,11 +478,13 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
 ----------hashAgg[GLOBAL]
 ------------hashAgg[LOCAL]
 --------------hashAgg[GLOBAL]
-----------------PhysicalCteConsumer ( cteId=CTEId#0 )
+----------------hashAgg[LOCAL]
+------------------PhysicalCteConsumer ( cteId=CTEId#0 )
 ----------hashAgg[GLOBAL]
 ------------hashAgg[LOCAL]
 --------------hashAgg[GLOBAL]
-----------------PhysicalCteConsumer ( cteId=CTEId#0 )
+----------------hashAgg[LOCAL]
+------------------PhysicalCteConsumer ( cteId=CTEId#0 )
 
 -- !multi_count_with_gby --
 PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -482,11 +495,13 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
 ------hashAgg[GLOBAL]
 --------hashAgg[LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 ------hashAgg[GLOBAL]
 --------hashAgg[LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 
 -- !multi_sum_with_gby --
 PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -497,11 +512,13 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
 ------hashAgg[GLOBAL]
 --------hashAgg[LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 ------hashAgg[GLOBAL]
 --------hashAgg[LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 
 -- !sum_count_with_gby --
 PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -512,10 +529,12 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
 ------hashAgg[GLOBAL]
 --------hashAgg[LOCAL]
 ----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
 ------hashAgg[GLOBAL]
 --------hashAgg[GLOBAL]
-----------PhysicalCteConsumer ( cteId=CTEId#0 )
+----------hashAgg[LOCAL]
+------------PhysicalCteConsumer ( cteId=CTEId#0 )
 
 -- !has_grouping --
 PhysicalResultSink
diff --git a/regression-test/suites/mv_p0/ssb/q_4_1_r1/q_4_1_r1.groovy 
b/regression-test/suites/mv_p0/ssb/q_4_1_r1/q_4_1_r1.groovy
index 92341f4b2c8..43ee187f7dc 100644
--- a/regression-test/suites/mv_p0/ssb/q_4_1_r1/q_4_1_r1.groovy
+++ b/regression-test/suites/mv_p0/ssb/q_4_1_r1/q_4_1_r1.groovy
@@ -96,7 +96,7 @@ suite ("q_4_1_r1") {
 
     sql """analyze table lineorder_flat with sync;"""
     sql """alter table lineorder_flat modify column LO_ORDERDATE set stats 
('row_count'='8');"""
-    sql """alter table lineorder_flat modify column a1 set stats 
('row_count'='1');"""
+    sql """alter table lineorder_flat modify column a1 set stats 
('row_count'='8');"""
 
 
     sql """set enable_stats=true;"""
@@ -105,9 +105,9 @@ suite ("q_4_1_r1") {
             SUM(LO_REVENUE - LO_SUPPLYCOST) AS profit
             FROM lineorder_flat
             WHERE
-            C_REGION = 'AMERICA'
-            AND S_REGION = 'AMERICA'
-            AND P_MFGR IN ('MFGR#1', 'MFGR#2')
+            C_REGION = 'region'
+            AND S_REGION = 'region'
+            AND P_MFGR IN ('mfgr')
             GROUP BY YEAR, C_NATION
             ORDER BY YEAR ASC, C_NATION ASC;""", "lineorder_mv")
 
@@ -116,9 +116,9 @@ suite ("q_4_1_r1") {
                 SUM(LO_REVENUE - LO_SUPPLYCOST) AS profit
                 FROM lineorder_flat
                 WHERE
-                C_REGION = 'AMERICA'
-                AND S_REGION = 'AMERICA'
-                AND P_MFGR IN ('MFGR#1', 'MFGR#2')
+                C_REGION = 'region'
+                AND S_REGION = 'region'
+                AND P_MFGR IN ('mfgr')
                 GROUP BY YEAR, C_NATION
                 ORDER BY YEAR ASC, C_NATION ASC;"""
 }
diff --git 
a/regression-test/suites/statistics/test_full_analyze_hot_value.groovy 
b/regression-test/suites/statistics/test_full_analyze_hot_value.groovy
new file mode 100644
index 00000000000..92211e69d05
--- /dev/null
+++ b/regression-test/suites/statistics/test_full_analyze_hot_value.groovy
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_full_analyze_hot_value") {
+
+    sql """drop database if exists test_full_analyze_hot_value"""
+    sql """create database test_full_analyze_hot_value"""
+    sql """use test_full_analyze_hot_value"""
+    sql """set global enable_auto_analyze=false"""
+
+    // Test 1: Full analyze collects hot_value for varchar column with skewed 
distribution
+    sql """drop table if exists full_hot_skew"""
+    sql """CREATE TABLE full_hot_skew (
+            key1 int NULL,
+            value1 varchar(25) NULL
+        )ENGINE=OLAP
+        DUPLICATE KEY(`key1`)
+        COMMENT "OLAP"
+        DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+        PROPERTIES (
+            "replication_num" = "1"
+        )
+    """
+    // Insert 100 rows: value1 has 2 values, "0" and "1", each appearing 50 
times
+    sql """insert into full_hot_skew select number, number % 2 from 
numbers("number"="100")"""
+
+    sql """analyze table full_hot_skew with sync"""
+    def result = sql """show column stats full_hot_skew(value1)"""
+    logger.info("Test1 result: " + result)
+    assertEquals(1, result.size())
+    assertEquals("100.0", result[0][2])
+    // Full analyze should now collect hot_value
+    assertTrue(result[0][17].contains(":"), "Full analyze should collect 
hot_value, but got " + result[0][17])
+    String[] hotValues = result[0][17].split(";")
+    assertEquals(2, hotValues.length)
+    assertTrue(hotValues[0].trim() == "'1':0.5" || hotValues[0].trim() == 
"'0':0.5")
+    assertTrue(hotValues[1].trim() == "'1':0.5" || hotValues[1].trim() == 
"'0':0.5")
+
+    // Verify cached stats also have hot_value
+    result = sql """show column cached stats full_hot_skew(value1)"""
+    logger.info("Test1 cached result: " + result)
+    assertEquals(1, result.size())
+    hotValues = result[0][17].split(";")
+    assertEquals(2, hotValues.length)
+    assertTrue(hotValues[0].trim() == "'1':0.5" || hotValues[0].trim() == 
"'0':0.5")
+    assertTrue(hotValues[1].trim() == "'1':0.5" || hotValues[1].trim() == 
"'0':0.5")
+
+    // Test 2: Full analyze collects hot_value for int column
+    result = sql """show column stats full_hot_skew(key1)"""
+    logger.info("Test2 result: " + result)
+    assertEquals(1, result.size())
+    assertEquals("100.0", result[0][2])
+    // key1 has 100 unique values, top 10 will each have proportion 0.01 -> 
ROUND to 0.01
+    assertTrue(result[0][17].contains(":"), "Full analyze should collect 
hot_value for int column")
+
+    // Test 3: Full analyze with special characters in values
+    sql """drop table if exists full_hot_special"""
+    sql """CREATE TABLE full_hot_special (
+            key1 int NULL,
+            value1 varchar(50) NULL
+        )ENGINE=OLAP
+        DUPLICATE KEY(`key1`)
+        COMMENT "OLAP"
+        DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+        PROPERTIES (
+            "replication_num" = "1"
+        )
+    """
+    sql """insert into full_hot_special select number, " : ;a" from 
numbers("number"="100")"""
+
+    sql """analyze table full_hot_special with sync"""
+    result = sql """show column stats full_hot_special(value1)"""
+    logger.info("Test3 result: " + result)
+    assertEquals(1, result.size())
+    // All 100 rows have the same value " : ;a", so it should appear with 
ratio 1.0
+    assertEquals("' : ;a':1.0", result[0][17])
+
+    // Test 4: Full analyze then sample analyze, hot_value should be updated
+    sql """drop stats full_hot_skew"""
+    sql """analyze table full_hot_skew with sample rows 400 with sync"""
+    result = sql """show column stats full_hot_skew(value1)"""
+    logger.info("Test4 result: " + result)
+    assertEquals(1, result.size())
+    assertTrue(result[0][17].contains(":"), "Sample analyze should also 
collect hot_value")
+    hotValues = result[0][17].split(";")
+    assertEquals(2, hotValues.length)
+    assertTrue(hotValues[0].trim() == "'1':0.5" || hotValues[0].trim() == 
"'0':0.5")
+    assertTrue(hotValues[1].trim() == "'1':0.5" || hotValues[1].trim() == 
"'0':0.5")
+
+    // Test 5: Verify full analyze produces same hot_value as sample analyze 
for same data
+    sql """drop stats full_hot_skew"""
+    sql """analyze table full_hot_skew with sync"""
+    def fullResult = sql """show column stats full_hot_skew(value1)"""
+    logger.info("Test5 full result: " + fullResult)
+    assertEquals(1, fullResult.size())
+    assertTrue(fullResult[0][17].contains(":"))
+    def fullParts = fullResult[0][17].split(";").collect { it.trim() }.sort()
+
+    sql """drop stats full_hot_skew"""
+    sql """analyze table full_hot_skew with sample rows 40000 with sync"""
+    def sampleResult = sql """show column stats full_hot_skew(value1)"""
+    logger.info("Test5 sample result: " + sampleResult)
+    assertEquals(1, sampleResult.size())
+    assertTrue(sampleResult[0][17].contains(":"))
+    // Both full and sample should produce the same hot_value entries (order 
may differ)
+    def sampleParts = sampleResult[0][17].split(";").collect { it.trim() 
}.sort()
+    assertEquals(fullParts, sampleParts)
+
+    // Test 6: Full analyze on empty table should produce an empty hot_value 
string
+    sql """drop table if exists full_hot_empty"""
+    sql """CREATE TABLE full_hot_empty (
+            key1 int NULL,
+            value1 varchar(25) NULL
+        )ENGINE=OLAP
+        DUPLICATE KEY(`key1`)
+        COMMENT "OLAP"
+        DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+        PROPERTIES (
+            "replication_num" = "1"
+        )
+    """
+    sql """analyze table full_hot_empty with sync"""
+    result = sql """show column stats full_hot_empty(value1)"""
+    logger.info("Test6 empty table result: " + result)
+    assertEquals(1, result.size())
+    assertEquals("0.0", result[0][2])
+    assertEquals("''", result[0][17])
+
+    // Test 7: Full analyze on all-NULL column should produce an empty 
hot_value string
+    sql """drop table if exists full_hot_all_null"""
+    sql """CREATE TABLE full_hot_all_null (
+            key1 int NULL,
+            value1 varchar(25) NULL
+        )ENGINE=OLAP
+        DUPLICATE KEY(`key1`)
+        COMMENT "OLAP"
+        DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+        PROPERTIES (
+            "replication_num" = "1"
+        )
+    """
+    sql """insert into full_hot_all_null select number, null from 
numbers("number"="100")"""
+    sql """analyze table full_hot_all_null with sync"""
+    result = sql """show column stats full_hot_all_null(value1)"""
+    logger.info("Test7 all-null result: " + result)
+    assertEquals(1, result.size())
+    assertEquals("100.0", result[0][2])
+    assertEquals("100.0", result[0][4])
+    assertEquals("''", result[0][17])
+
+    sql """drop database if exists test_full_analyze_hot_value"""
+}
diff --git a/regression-test/suites/statistics/test_hot_value.groovy 
b/regression-test/suites/statistics/test_hot_value.groovy
index c4021f17555..4d8ee5643df 100644
--- a/regression-test/suites/statistics/test_hot_value.groovy
+++ b/regression-test/suites/statistics/test_hot_value.groovy
@@ -81,28 +81,36 @@ suite("test_hot_value") {
     wait_row_count_reported("test_hot_value", "test1", 0, 4, "10000")
     wait_row_count_reported("test_hot_value", "test2", 0, 4, "10000")
     sql """analyze table test1 with sync"""
-    logger.info("1. memo plan ")
     explain {
         sql("memo plan select * from test1")
-        contains "hotValues=(null)"
+        contains "hotValues=("
+        notContains "hotValues=(null)"
     }
     def result = sql """show column stats test1(key1)"""
     assertEquals(1, result.size())
     assertEquals("10000.0", result[0][2])
-    assertEquals("null", result[0][17])
+    assertTrue(result[0][17].contains(":"))
     result = sql """show column stats test1(value1)"""
     logger.info("0. result " + result)
     assertEquals(1, result.size())
     assertEquals("10000.0", result[0][2])
-    assertEquals("null", result[0][17])
+    String[] fullHotValues = result[0][17].split(";")
+    logger.info("0.1 fullHotValues " + result[0][17])
+    assertEquals(2, fullHotValues.length)
+    assertTrue(fullHotValues[0].trim() == "'1':0.5" || fullHotValues[0].trim() 
== "'0':0.5")
+    assertTrue(fullHotValues[1].trim() == "'1':0.5" || fullHotValues[1].trim() 
== "'0':0.5")
     result = sql """show column cached stats test1(key1)"""
     assertEquals(1, result.size())
     assertEquals("10000.0", result[0][2])
-    assertEquals("null", result[0][17])
+    // Full analyze collects top values for high-NDV key columns.
+    assertTrue(result[0][17].contains(":"))
     result = sql """show column cached stats test1(value1)"""
     assertEquals(1, result.size())
     assertEquals("10000.0", result[0][2])
-    assertEquals("null", result[0][17])
+    fullHotValues = result[0][17].split(";")
+    assertEquals(2, fullHotValues.length)
+    assertTrue(fullHotValues[0].trim() == "'1':0.5" || fullHotValues[0].trim() 
== "'0':0.5")
+    assertTrue(fullHotValues[1].trim() == "'1':0.5" || fullHotValues[1].trim() 
== "'0':0.5")
     sql """drop stats test1"""
     sql """analyze table test1 with sample rows 400 with sync"""
     result = sql """show column stats test1(key1)"""


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to