This is an automated email from the ASF dual-hosted git repository.
morrySnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new d77856dc715 [improvement](statistics) Collect hot value during full
statistics analysis (#62435)
d77856dc715 is described below
commit d77856dc71585da845d3646cf9c20d2fd94c0c48
Author: yujun <[email protected]>
AuthorDate: Mon May 18 18:10:09 2026 +0800
[improvement](statistics) Collect hot value during full statistics analysis
(#62435)
### What problem does this PR solve?
Problem Summary: Previously, SQL-based full statistics collection
(`ANALYZE TABLE ... WITH SYNC`)
hardcoded `null as hot_value` in `FULL_ANALYZE_TEMPLATE`, meaning hot
values (frequent
values) were only collected during sample-based analysis. This made
SQL-based full analysis
produce less complete statistics than sample analysis.
This change rewrites `FULL_ANALYZE_TEMPLATE` to use a CTE-based
structure (matching
`LINEAR_ANALYZE_TEMPLATE` and `DUJ1_ANALYZE_TEMPLATE` patterns) that
computes hot values
via GROUP BY + TOP-N aggregation. Both `OlapAnalysisTask` and
`ExternalAnalysisTask` are
updated to pass the required parameters (`hotValueCollectCount`,
`subStringColName`,
`rowCount2`) to the template.
Scope note: this PR intentionally targets non-partition OLAP full
analyze and SQL-based
external full analyze. Partition analyze keeps using the existing
partition-level stats
and merge path, and default HMS metadata analyze keeps using
HMS-provided metadata without
scanning table rows; those paths do not derive or publish hot values in
this PR.
### Release note
SQL-based full statistics collection now also collects hot value
(frequent value)
information, matching the behavior of sample-based collection for the
covered paths.
---------
Co-authored-by: Copilot <[email protected]>
---
.../apache/doris/statistics/BaseAnalysisTask.java | 25 +++-
.../doris/statistics/ExternalAnalysisTask.java | 6 +-
.../apache/doris/statistics/OlapAnalysisTask.java | 6 +-
.../doris/statistics/HMSAnalysisTaskTest.java | 72 +++++----
.../doris/statistics/OlapAnalysisTaskTest.java | 122 ++++++++++++++-
.../distinct_split/disitinct_split.out | 57 ++++---
.../suites/mv_p0/ssb/q_4_1_r1/q_4_1_r1.groovy | 14 +-
.../statistics/test_full_analyze_hot_value.groovy | 166 +++++++++++++++++++++
.../suites/statistics/test_hot_value.groovy | 20 ++-
9 files changed, 415 insertions(+), 73 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index f382cb8350c..852ec0e6fde 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -73,8 +73,11 @@ public abstract class BaseAnalysisTask {
*/
public static final String ANALYZE_SKIP_LONG_STRING_COLUMN_MARKER =
"ANALYZE_SKIP_LONG_STRING_COLUMN";
- protected static final String FULL_ANALYZE_TEMPLATE =
- "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
+ protected static final String FULL_ANALYZE_TEMPLATE = "WITH cte1 AS ("
+ + "SELECT `${colName}`${lengthAssert} "
+ + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}), "
+ + "cte2 AS ("
+ + "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS
`id`, "
+ "${catalogId} AS `catalog_id`, "
+ "${dbId} AS `db_id`, "
+ "${tblId} AS `tbl_id`, "
@@ -87,10 +90,20 @@ public abstract class BaseAnalysisTask {
+ "SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) AS
`min`, "
+ "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) AS
`max`, "
+ "${dataSizeFunction} AS `data_size`, "
- + "NOW() AS `update_time`, "
- + "null as `hot_value` "
- + "FROM (SELECT `${colName}`${lengthAssert} "
- + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index})
__lc_t";
+ + "NOW() "
+ + "FROM cte1), "
+ + "cte3 AS ("
+ + "SELECT IFNULL(GROUP_CONCAT(CONCAT("
+ + "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"),
\";\", \"\\\\;\"), "
+ + "\" :\", ROUND(t.`count` / ${rowCount2}, 2)), \" ;\"),
'') "
+ + "as `hot_value` "
+ + "FROM ("
+ + "SELECT ${subStringColName} as `hash_value`, "
+ + "MAX(`${colName}`) as `column_key`, "
+ + "COUNT(1) AS `count` "
+ + "FROM cte1 WHERE `${colName}` IS NOT NULL "
+ + "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT
${hotValueCollectCount}) t) "
+ + "SELECT * FROM cte2 CROSS JOIN cte3";
protected static final String LINEAR_ANALYZE_TEMPLATE = "WITH cte1 AS ("
+ "SELECT `${colName}`${lengthAssert} "
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
index b9a2525ac6f..8c4eccb1165 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
@@ -21,6 +21,7 @@ import org.apache.doris.common.DdlException;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.NotImplementedException;
import org.apache.doris.datasource.ExternalTable;
+import org.apache.doris.qe.SessionVariable;
import org.apache.commons.text.StringSubstitutor;
@@ -59,9 +60,10 @@ public class ExternalAnalysisTask extends BaseAnalysisTask {
protected void doFull() throws Exception {
StringBuilder sb = new StringBuilder();
Map<String, String> params = buildSqlParams();
- params.put("min", getMinFunction());
- params.put("max", getMaxFunction());
params.put("dataSizeFunction", getDataSizeFunction(col, false));
+ params.put("hotValueCollectCount",
String.valueOf(SessionVariable.getHotValueCollectCount()));
+ params.put("subStringColName", getStringTypeColName(col));
+ params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `${colName}`
IS NOT NULL)");
if (LOG.isDebugEnabled()) {
LOG.debug("Will do full collection for column {}", col.getName());
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index bd0c5d94f12..4312b31208f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -329,7 +329,11 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
if (StatisticsUtil.enablePartitionAnalyze() &&
tbl.isPartitionedTable()) {
doPartitionTable();
} else {
- StringSubstitutor stringSubstitutor = new
StringSubstitutor(buildSqlParams());
+ Map<String, String> params = buildSqlParams();
+ params.put("hotValueCollectCount",
String.valueOf(SessionVariable.getHotValueCollectCount()));
+ params.put("subStringColName", getStringTypeColName(col));
+ params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE
`${colName}` IS NOT NULL)");
+ StringSubstitutor stringSubstitutor = new
StringSubstitutor(params);
runQuery(stringSubstitutor.replace(FULL_ANALYZE_TEMPLATE));
}
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
index faecabe7469..0df615319f1 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
@@ -24,6 +24,7 @@ import org.apache.doris.catalog.PrimitiveType;
import org.apache.doris.common.Pair;
import org.apache.doris.datasource.CatalogIf;
import org.apache.doris.datasource.hive.HMSExternalTable;
+import org.apache.doris.qe.SessionVariable;
import org.apache.doris.statistics.util.StatisticsUtil;
import com.google.common.collect.ImmutableList;
@@ -31,6 +32,7 @@ import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
+import org.mockito.MockedStatic;
import org.mockito.Mockito;
import java.util.ArrayList;
@@ -156,34 +158,48 @@ public class HMSAnalysisTaskTest {
Mockito.when(databaseIf.getFullName()).thenReturn("default");
Mockito.when(tableIf.getPartitionNames()).thenReturn(ImmutableSet.of("date=20230101/hour=12"));
- HMSAnalysisTask task = Mockito.spy(new HMSAnalysisTask());
- Mockito.doAnswer(invocation -> {
- String sql = invocation.getArgument(0);
- Assertions.assertEquals("SELECT CONCAT(30001, '-', -1, '-',
'hour') AS `id`, "
- + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, "
- + "-1 AS `idx_id`, 'hour' AS `col_id`, NULL AS `part_id`, "
- + "COUNT(1) AS `row_count`, NDV(`hour`) AS `ndv`, "
- + "COUNT(1) - COUNT(`hour`) AS `null_count`, "
- + "SUBSTRING(CAST(MIN(`hour`) AS STRING), 1, 1024) AS
`min`, "
- + "SUBSTRING(CAST(MAX(`hour`) AS STRING), 1, 1024) AS
`max`, "
- + "COUNT(1) * 4 AS `data_size`, NOW() AS `update_time`, "
- + "null as `hot_value` FROM (SELECT `hour` FROM
`hms`.`default`.`test` ) __lc_t", sql);
- return null;
- }).when(task).runQuery(Mockito.anyString());
-
- task.col = new Column("hour", PrimitiveType.INT);
- task.tbl = tableIf;
- task.catalog = catalogIf;
- task.db = databaseIf;
- task.setTable(tableIf);
-
- AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder();
- analysisInfoBuilder.setColName("hour");
- analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
- analysisInfoBuilder.setUsingSqlForExternalTable(true);
- task.info = analysisInfoBuilder.build();
-
- task.doExecute();
+ try (MockedStatic<SessionVariable> mockedSessionVariable =
Mockito.mockStatic(SessionVariable.class)) {
+
mockedSessionVariable.when(SessionVariable::getHotValueCollectCount).thenReturn(10);
+
+ HMSAnalysisTask task = Mockito.spy(new HMSAnalysisTask());
+ Mockito.doAnswer(invocation -> {
+ String sql = invocation.getArgument(0);
+ Assertions.assertEquals("WITH cte1 AS (SELECT `hour` "
+ + "FROM `hms`.`default`.`test` ), "
+ + "cte2 AS (SELECT CONCAT(30001, '-', -1, '-', 'hour')
AS `id`, "
+ + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, "
+ + "-1 AS `idx_id`, 'hour' AS `col_id`, NULL AS
`part_id`, "
+ + "COUNT(1) AS `row_count`, NDV(`hour`) AS `ndv`, "
+ + "COUNT(1) - COUNT(`hour`) AS `null_count`, "
+ + "SUBSTRING(CAST(MIN(`hour`) AS STRING), 1, 1024) AS
`min`, "
+ + "SUBSTRING(CAST(MAX(`hour`) AS STRING), 1, 1024) AS
`max`, "
+ + "COUNT(1) * 4 AS `data_size`, NOW() FROM cte1), "
+ + "cte3 AS (SELECT IFNULL(GROUP_CONCAT(CONCAT("
+ + "REPLACE(REPLACE(t.`column_key`, \":\", \"\\\\:\"),
\";\", \"\\\\;\"), "
+ + "\" :\", ROUND(t.`count` / "
+ + "(SELECT COUNT(1) FROM cte1 WHERE `hour` IS NOT
NULL), 2)), \" ;\"), '') "
+ + "as `hot_value` FROM (SELECT `hour` as `hash_value`,
"
+ + "MAX(`hour`) as `column_key`, COUNT(1) AS `count` "
+ + "FROM cte1 WHERE `hour` IS NOT NULL "
+ + "GROUP BY `hash_value` ORDER BY `count` DESC LIMIT
10) t) "
+ + "SELECT * FROM cte2 CROSS JOIN cte3", sql);
+ return null;
+ }).when(task).runQuery(Mockito.anyString());
+
+ task.col = new Column("hour", PrimitiveType.INT);
+ task.tbl = tableIf;
+ task.catalog = catalogIf;
+ task.db = databaseIf;
+ task.setTable(tableIf);
+
+ AnalysisInfoBuilder analysisInfoBuilder = new
AnalysisInfoBuilder();
+ analysisInfoBuilder.setColName("hour");
+ analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
+ analysisInfoBuilder.setUsingSqlForExternalTable(true);
+ task.info = analysisInfoBuilder.build();
+
+ task.doExecute();
+ }
}
@SuppressWarnings("unchecked")
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index 27515f16670..2c93ad7dd9a 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -39,6 +39,7 @@ import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.datasource.CatalogIf;
+import org.apache.doris.qe.SessionVariable;
import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod;
import org.apache.doris.statistics.AnalysisInfo.JobType;
import org.apache.doris.statistics.util.StatisticsUtil;
@@ -698,9 +699,9 @@ public class OlapAnalysisTaskTest {
+
BaseAnalysisTask.ANALYZE_SKIP_LONG_STRING_COLUMN_MARKER + "') AS `__lc`");
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql =
stringSubstitutor.replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE);
- Assertions.assertTrue(sql.contains("FROM (SELECT `s`, assert_true("),
sql);
+ Assertions.assertTrue(sql.contains("WITH cte1 AS (SELECT `s`,
assert_true("), sql);
Assertions.assertTrue(sql.contains("IS NULL OR LENGTH(`s`) <= 1024"),
sql);
- Assertions.assertTrue(sql.endsWith(") __lc_t"), sql);
+ Assertions.assertTrue(sql.contains("FROM cte1), cte3 AS ("), sql);
}
@Test
@@ -724,7 +725,120 @@ public class OlapAnalysisTaskTest {
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql =
stringSubstitutor.replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE);
Assertions.assertFalse(sql.contains("assert_true"), sql);
- Assertions.assertTrue(sql.contains("FROM (SELECT `id` FROM
`internal`.`db1`.`tbl1`"), sql);
- Assertions.assertTrue(sql.endsWith(") __lc_t"), sql);
+ Assertions.assertTrue(sql.contains("WITH cte1 AS (SELECT `id` FROM
`internal`.`db1`.`tbl1`"), sql);
+ Assertions.assertTrue(sql.contains("FROM cte1), cte3 AS ("), sql);
+
+ }
+
+ @Test
+ public void testFullAnalyzeTemplateSql() {
+ Map<String, String> params = new HashMap<>();
+ params.put("catalogId", "0");
+ params.put("dbId", "1");
+ params.put("tblId", "2");
+ params.put("idxId", "3");
+ params.put("colId", "col1");
+ params.put("colName", "col1");
+ params.put("dataSizeFunction", "SUM(LENGTH(`col1`))");
+ params.put("catalogName", "internal");
+ params.put("dbName", "db1");
+ params.put("tblName", "tbl1");
+ params.put("index", "");
+ params.put("hotValueCollectCount", "10");
+ params.put("subStringColName", "`col1`");
+ params.put("rowCount2", "(SELECT COUNT(1) FROM cte1 WHERE `col1` IS
NOT NULL)");
+ StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
+ String sql =
stringSubstitutor.replace(BaseAnalysisTask.FULL_ANALYZE_TEMPLATE);
+ Assertions.assertTrue(sql.startsWith("WITH cte1 AS ("));
+ Assertions.assertTrue(sql.contains("cte3 AS (SELECT
IFNULL(GROUP_CONCAT"));
+ Assertions.assertTrue(sql.contains("as `hot_value`"));
+ Assertions.assertTrue(sql.contains("CROSS JOIN cte3"));
+ Assertions.assertTrue(sql.contains("LIMIT 10"));
+ Assertions.assertTrue(sql.contains("GROUP BY `hash_value` ORDER BY
`count` DESC"));
+ Assertions.assertFalse(sql.contains("null as `hot_value`"));
+ }
+
+ @Test
+ public void testDoFullHotValue() throws Exception {
+ CatalogIf catalogIf = Mockito.mock(CatalogIf.class);
+ DatabaseIf databaseIf = Mockito.mock(DatabaseIf.class);
+ OlapTable tableIf = Mockito.mock(OlapTable.class);
+ Mockito.when(tableIf.getId()).thenReturn(30001L);
+ Mockito.when(tableIf.getName()).thenReturn("testTbl");
+ Mockito.when(catalogIf.getId()).thenReturn(10001L);
+ Mockito.when(catalogIf.getName()).thenReturn("catalogName");
+ Mockito.when(databaseIf.getId()).thenReturn(20001L);
+ Mockito.when(databaseIf.getFullName()).thenReturn("testDb");
+
+ try (MockedStatic<StatisticsUtil> mockedStatisticsUtil =
Mockito.mockStatic(
+ StatisticsUtil.class, Mockito.CALLS_REAL_METHODS);
+ MockedStatic<SessionVariable> mockedSessionVariable =
Mockito.mockStatic(SessionVariable.class)) {
+
mockedStatisticsUtil.when(StatisticsUtil::enablePartitionAnalyze).thenReturn(false);
+
mockedSessionVariable.when(SessionVariable::getHotValueCollectCount).thenReturn(10);
+
+ OlapAnalysisTask task = Mockito.spy(new OlapAnalysisTask());
+ Mockito.doAnswer(invocation -> {
+ String sql = invocation.getArgument(0);
+ Assertions.assertTrue(sql.startsWith("WITH cte1 AS (SELECT
`testCol` "
+ + "FROM `catalogName`.`testDb`.`testTbl` "), sql);
+ Assertions.assertTrue(sql.contains("cte3 AS (SELECT
IFNULL(GROUP_CONCAT"), sql);
+ Assertions.assertTrue(sql.contains("`testCol` as
`hash_value`"), sql);
+ Assertions.assertTrue(sql.contains("LIMIT 10"), sql);
+ Assertions.assertTrue(sql.contains("CROSS JOIN cte3"), sql);
+ Assertions.assertFalse(sql.contains("null as `hot_value`"),
sql);
+ return null;
+ }).when(task).runQuery(Mockito.anyString());
+
+ task.col = new Column("testCol",
Type.fromPrimitiveType(PrimitiveType.INT),
+ true, null, null, null);
+ task.tbl = tableIf;
+ AnalysisInfoBuilder builder = new AnalysisInfoBuilder();
+ builder.setJobType(AnalysisInfo.JobType.MANUAL);
+ builder.setColName("testCol");
+ task.info = builder.build();
+ task.catalog = catalogIf;
+ task.db = databaseIf;
+ task.doFull();
+ }
+ }
+
+ @Test
+ public void testDoFullHotValueStringColumn() throws Exception {
+ CatalogIf catalogIf = Mockito.mock(CatalogIf.class);
+ DatabaseIf databaseIf = Mockito.mock(DatabaseIf.class);
+ OlapTable tableIf = Mockito.mock(OlapTable.class);
+ Mockito.when(tableIf.getId()).thenReturn(30001L);
+ Mockito.when(tableIf.getName()).thenReturn("testTbl");
+ Mockito.when(catalogIf.getId()).thenReturn(10001L);
+ Mockito.when(catalogIf.getName()).thenReturn("catalogName");
+ Mockito.when(databaseIf.getId()).thenReturn(20001L);
+ Mockito.when(databaseIf.getFullName()).thenReturn("testDb");
+
+ try (MockedStatic<StatisticsUtil> mockedStatisticsUtil =
Mockito.mockStatic(
+ StatisticsUtil.class, Mockito.CALLS_REAL_METHODS);
+ MockedStatic<SessionVariable> mockedSessionVariable =
Mockito.mockStatic(SessionVariable.class)) {
+
mockedStatisticsUtil.when(StatisticsUtil::enablePartitionAnalyze).thenReturn(false);
+
mockedSessionVariable.when(SessionVariable::getHotValueCollectCount).thenReturn(10);
+
+ OlapAnalysisTask task = Mockito.spy(new OlapAnalysisTask());
+ Mockito.doAnswer(invocation -> {
+ String sql = invocation.getArgument(0);
+ Assertions.assertTrue(sql.contains(
+ "xxhash_64(SUBSTRING(CAST(`strCol` AS STRING), 1,
1024)) as `hash_value`"), sql);
+ Assertions.assertTrue(sql.contains("MAX(`strCol`) as
`column_key`"), sql);
+ return null;
+ }).when(task).runQuery(Mockito.anyString());
+
+ task.col = new Column("strCol",
Type.fromPrimitiveType(PrimitiveType.STRING),
+ true, null, null, null);
+ task.tbl = tableIf;
+ AnalysisInfoBuilder builder = new AnalysisInfoBuilder();
+ builder.setJobType(AnalysisInfo.JobType.MANUAL);
+ builder.setColName("strCol");
+ task.info = builder.build();
+ task.catalog = catalogIf;
+ task.db = databaseIf;
+ task.doFull();
+ }
}
}
diff --git
a/regression-test/data/nereids_rules_p0/distinct_split/disitinct_split.out
b/regression-test/data/nereids_rules_p0/distinct_split/disitinct_split.out
index b239f4d54f8..9a18cab3879 100644
--- a/regression-test/data/nereids_rules_p0/distinct_split/disitinct_split.out
+++ b/regression-test/data/nereids_rules_p0/distinct_split/disitinct_split.out
@@ -369,11 +369,13 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
------hashAgg[DISTINCT_GLOBAL]
--------hashAgg[DISTINCT_LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
------hashAgg[DISTINCT_GLOBAL]
--------hashAgg[DISTINCT_LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
-- !multi_count_mulitcols_with_gby --
PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -384,11 +386,13 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
------hashAgg[GLOBAL]
--------hashAgg[LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
------hashAgg[GLOBAL]
--------hashAgg[LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
-- !three_count_mulitcols_without_gby --
PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -400,15 +404,18 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
--------hashAgg[DISTINCT_GLOBAL]
----------hashAgg[DISTINCT_LOCAL]
------------hashAgg[GLOBAL]
---------------PhysicalCteConsumer ( cteId=CTEId#0 )
+--------------hashAgg[LOCAL]
+----------------PhysicalCteConsumer ( cteId=CTEId#0 )
--------hashAgg[DISTINCT_GLOBAL]
----------hashAgg[DISTINCT_LOCAL]
------------hashAgg[GLOBAL]
---------------PhysicalCteConsumer ( cteId=CTEId#0 )
+--------------hashAgg[LOCAL]
+----------------PhysicalCteConsumer ( cteId=CTEId#0 )
------hashAgg[DISTINCT_GLOBAL]
--------hashAgg[DISTINCT_LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
-- !four_count_mulitcols_with_gby --
PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -421,19 +428,23 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
----------hashAgg[GLOBAL]
------------hashAgg[LOCAL]
--------------hashAgg[GLOBAL]
-----------------PhysicalCteConsumer ( cteId=CTEId#0 )
+----------------hashAgg[LOCAL]
+------------------PhysicalCteConsumer ( cteId=CTEId#0 )
----------hashAgg[GLOBAL]
------------hashAgg[LOCAL]
--------------hashAgg[GLOBAL]
-----------------PhysicalCteConsumer ( cteId=CTEId#0 )
+----------------hashAgg[LOCAL]
+------------------PhysicalCteConsumer ( cteId=CTEId#0 )
--------hashAgg[GLOBAL]
----------hashAgg[LOCAL]
------------hashAgg[GLOBAL]
---------------PhysicalCteConsumer ( cteId=CTEId#0 )
+--------------hashAgg[LOCAL]
+----------------PhysicalCteConsumer ( cteId=CTEId#0 )
------hashAgg[GLOBAL]
--------hashAgg[LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
-- !has_other_func --
PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -467,11 +478,13 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
----------hashAgg[GLOBAL]
------------hashAgg[LOCAL]
--------------hashAgg[GLOBAL]
-----------------PhysicalCteConsumer ( cteId=CTEId#0 )
+----------------hashAgg[LOCAL]
+------------------PhysicalCteConsumer ( cteId=CTEId#0 )
----------hashAgg[GLOBAL]
------------hashAgg[LOCAL]
--------------hashAgg[GLOBAL]
-----------------PhysicalCteConsumer ( cteId=CTEId#0 )
+----------------hashAgg[LOCAL]
+------------------PhysicalCteConsumer ( cteId=CTEId#0 )
-- !multi_count_with_gby --
PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -482,11 +495,13 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
------hashAgg[GLOBAL]
--------hashAgg[LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
------hashAgg[GLOBAL]
--------hashAgg[LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
-- !multi_sum_with_gby --
PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -497,11 +512,13 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
------hashAgg[GLOBAL]
--------hashAgg[LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
------hashAgg[GLOBAL]
--------hashAgg[LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
-- !sum_count_with_gby --
PhysicalCteAnchor ( cteId=CTEId#0 )
@@ -512,10 +529,12 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
------hashAgg[GLOBAL]
--------hashAgg[LOCAL]
----------hashAgg[GLOBAL]
-------------PhysicalCteConsumer ( cteId=CTEId#0 )
+------------hashAgg[LOCAL]
+--------------PhysicalCteConsumer ( cteId=CTEId#0 )
------hashAgg[GLOBAL]
--------hashAgg[GLOBAL]
-----------PhysicalCteConsumer ( cteId=CTEId#0 )
+----------hashAgg[LOCAL]
+------------PhysicalCteConsumer ( cteId=CTEId#0 )
-- !has_grouping --
PhysicalResultSink
diff --git a/regression-test/suites/mv_p0/ssb/q_4_1_r1/q_4_1_r1.groovy
b/regression-test/suites/mv_p0/ssb/q_4_1_r1/q_4_1_r1.groovy
index 92341f4b2c8..43ee187f7dc 100644
--- a/regression-test/suites/mv_p0/ssb/q_4_1_r1/q_4_1_r1.groovy
+++ b/regression-test/suites/mv_p0/ssb/q_4_1_r1/q_4_1_r1.groovy
@@ -96,7 +96,7 @@ suite ("q_4_1_r1") {
sql """analyze table lineorder_flat with sync;"""
sql """alter table lineorder_flat modify column LO_ORDERDATE set stats
('row_count'='8');"""
- sql """alter table lineorder_flat modify column a1 set stats
('row_count'='1');"""
+ sql """alter table lineorder_flat modify column a1 set stats
('row_count'='8');"""
sql """set enable_stats=true;"""
@@ -105,9 +105,9 @@ suite ("q_4_1_r1") {
SUM(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM lineorder_flat
WHERE
- C_REGION = 'AMERICA'
- AND S_REGION = 'AMERICA'
- AND P_MFGR IN ('MFGR#1', 'MFGR#2')
+ C_REGION = 'region'
+ AND S_REGION = 'region'
+ AND P_MFGR IN ('mfgr')
GROUP BY YEAR, C_NATION
ORDER BY YEAR ASC, C_NATION ASC;""", "lineorder_mv")
@@ -116,9 +116,9 @@ suite ("q_4_1_r1") {
SUM(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM lineorder_flat
WHERE
- C_REGION = 'AMERICA'
- AND S_REGION = 'AMERICA'
- AND P_MFGR IN ('MFGR#1', 'MFGR#2')
+ C_REGION = 'region'
+ AND S_REGION = 'region'
+ AND P_MFGR IN ('mfgr')
GROUP BY YEAR, C_NATION
ORDER BY YEAR ASC, C_NATION ASC;"""
}
diff --git
a/regression-test/suites/statistics/test_full_analyze_hot_value.groovy
b/regression-test/suites/statistics/test_full_analyze_hot_value.groovy
new file mode 100644
index 00000000000..92211e69d05
--- /dev/null
+++ b/regression-test/suites/statistics/test_full_analyze_hot_value.groovy
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_full_analyze_hot_value") {
+
+ sql """drop database if exists test_full_analyze_hot_value"""
+ sql """create database test_full_analyze_hot_value"""
+ sql """use test_full_analyze_hot_value"""
+ sql """set global enable_auto_analyze=false"""
+
+ // Test 1: Full analyze collects hot_value for varchar column with skewed
distribution
+ sql """drop table if exists full_hot_skew"""
+ sql """CREATE TABLE full_hot_skew (
+ key1 int NULL,
+ value1 varchar(25) NULL
+ )ENGINE=OLAP
+ DUPLICATE KEY(`key1`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+ PROPERTIES (
+ "replication_num" = "1"
+ )
+ """
+ // Insert 100 rows: value1 has 2 values, "0" and "1", each appearing 50
times
+ sql """insert into full_hot_skew select number, number % 2 from
numbers("number"="100")"""
+
+ sql """analyze table full_hot_skew with sync"""
+ def result = sql """show column stats full_hot_skew(value1)"""
+ logger.info("Test1 result: " + result)
+ assertEquals(1, result.size())
+ assertEquals("100.0", result[0][2])
+ // Full analyze should now collect hot_value
+ assertTrue(result[0][17].contains(":"), "Full analyze should collect
hot_value, but got " + result[0][17])
+ String[] hotValues = result[0][17].split(";")
+ assertEquals(2, hotValues.length)
+ assertTrue(hotValues[0].trim() == "'1':0.5" || hotValues[0].trim() ==
"'0':0.5")
+ assertTrue(hotValues[1].trim() == "'1':0.5" || hotValues[1].trim() ==
"'0':0.5")
+
+ // Verify cached stats also have hot_value
+ result = sql """show column cached stats full_hot_skew(value1)"""
+ logger.info("Test1 cached result: " + result)
+ assertEquals(1, result.size())
+ hotValues = result[0][17].split(";")
+ assertEquals(2, hotValues.length)
+ assertTrue(hotValues[0].trim() == "'1':0.5" || hotValues[0].trim() ==
"'0':0.5")
+ assertTrue(hotValues[1].trim() == "'1':0.5" || hotValues[1].trim() ==
"'0':0.5")
+
+ // Test 2: Full analyze collects hot_value for int column
+ result = sql """show column stats full_hot_skew(key1)"""
+ logger.info("Test2 result: " + result)
+ assertEquals(1, result.size())
+ assertEquals("100.0", result[0][2])
+ // key1 has 100 unique values, top 10 will each have proportion 0.01 ->
ROUND to 0.01
+ assertTrue(result[0][17].contains(":"), "Full analyze should collect
hot_value for int column")
+
+ // Test 3: Full analyze with special characters in values
+ sql """drop table if exists full_hot_special"""
+ sql """CREATE TABLE full_hot_special (
+ key1 int NULL,
+ value1 varchar(50) NULL
+ )ENGINE=OLAP
+ DUPLICATE KEY(`key1`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+ PROPERTIES (
+ "replication_num" = "1"
+ )
+ """
+ sql """insert into full_hot_special select number, " : ;a" from
numbers("number"="100")"""
+
+ sql """analyze table full_hot_special with sync"""
+ result = sql """show column stats full_hot_special(value1)"""
+ logger.info("Test3 result: " + result)
+ assertEquals(1, result.size())
+ // All 100 rows have the same value " : ;a", so it should appear with
ratio 1.0
+ assertEquals("' : ;a':1.0", result[0][17])
+
+ // Test 4: Full analyze then sample analyze, hot_value should be updated
+ sql """drop stats full_hot_skew"""
+ sql """analyze table full_hot_skew with sample rows 400 with sync"""
+ result = sql """show column stats full_hot_skew(value1)"""
+ logger.info("Test4 result: " + result)
+ assertEquals(1, result.size())
+ assertTrue(result[0][17].contains(":"), "Sample analyze should also
collect hot_value")
+ hotValues = result[0][17].split(";")
+ assertEquals(2, hotValues.length)
+ assertTrue(hotValues[0].trim() == "'1':0.5" || hotValues[0].trim() ==
"'0':0.5")
+ assertTrue(hotValues[1].trim() == "'1':0.5" || hotValues[1].trim() ==
"'0':0.5")
+
+ // Test 5: Verify full analyze produces same hot_value as sample analyze
for same data
+ sql """drop stats full_hot_skew"""
+ sql """analyze table full_hot_skew with sync"""
+ def fullResult = sql """show column stats full_hot_skew(value1)"""
+ logger.info("Test5 full result: " + fullResult)
+ assertEquals(1, fullResult.size())
+ assertTrue(fullResult[0][17].contains(":"))
+ def fullParts = fullResult[0][17].split(";").collect { it.trim() }.sort()
+
+ sql """drop stats full_hot_skew"""
+ sql """analyze table full_hot_skew with sample rows 40000 with sync"""
+ def sampleResult = sql """show column stats full_hot_skew(value1)"""
+ logger.info("Test5 sample result: " + sampleResult)
+ assertEquals(1, sampleResult.size())
+ assertTrue(sampleResult[0][17].contains(":"))
+ // Both full and sample should produce the same hot_value entries (order
may differ)
+ def sampleParts = sampleResult[0][17].split(";").collect { it.trim()
}.sort()
+ assertEquals(fullParts, sampleParts)
+
+ // Test 6: Full analyze on empty table should produce an empty hot_value
string
+ sql """drop table if exists full_hot_empty"""
+ sql """CREATE TABLE full_hot_empty (
+ key1 int NULL,
+ value1 varchar(25) NULL
+ )ENGINE=OLAP
+ DUPLICATE KEY(`key1`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+ PROPERTIES (
+ "replication_num" = "1"
+ )
+ """
+ sql """analyze table full_hot_empty with sync"""
+ result = sql """show column stats full_hot_empty(value1)"""
+ logger.info("Test6 empty table result: " + result)
+ assertEquals(1, result.size())
+ assertEquals("0.0", result[0][2])
+ assertEquals("''", result[0][17])
+
+ // Test 7: Full analyze on all-NULL column should produce an empty
hot_value string
+ sql """drop table if exists full_hot_all_null"""
+ sql """CREATE TABLE full_hot_all_null (
+ key1 int NULL,
+ value1 varchar(25) NULL
+ )ENGINE=OLAP
+ DUPLICATE KEY(`key1`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+ PROPERTIES (
+ "replication_num" = "1"
+ )
+ """
+ sql """insert into full_hot_all_null select number, null from
numbers("number"="100")"""
+ sql """analyze table full_hot_all_null with sync"""
+ result = sql """show column stats full_hot_all_null(value1)"""
+ logger.info("Test7 all-null result: " + result)
+ assertEquals(1, result.size())
+ assertEquals("100.0", result[0][2])
+ assertEquals("100.0", result[0][4])
+ assertEquals("''", result[0][17])
+
+ sql """drop database if exists test_full_analyze_hot_value"""
+}
diff --git a/regression-test/suites/statistics/test_hot_value.groovy
b/regression-test/suites/statistics/test_hot_value.groovy
index c4021f17555..4d8ee5643df 100644
--- a/regression-test/suites/statistics/test_hot_value.groovy
+++ b/regression-test/suites/statistics/test_hot_value.groovy
@@ -81,28 +81,36 @@ suite("test_hot_value") {
wait_row_count_reported("test_hot_value", "test1", 0, 4, "10000")
wait_row_count_reported("test_hot_value", "test2", 0, 4, "10000")
sql """analyze table test1 with sync"""
- logger.info("1. memo plan ")
explain {
sql("memo plan select * from test1")
- contains "hotValues=(null)"
+ contains "hotValues=("
+ notContains "hotValues=(null)"
}
def result = sql """show column stats test1(key1)"""
assertEquals(1, result.size())
assertEquals("10000.0", result[0][2])
- assertEquals("null", result[0][17])
+ assertTrue(result[0][17].contains(":"))
result = sql """show column stats test1(value1)"""
logger.info("0. result " + result)
assertEquals(1, result.size())
assertEquals("10000.0", result[0][2])
- assertEquals("null", result[0][17])
+ String[] fullHotValues = result[0][17].split(";")
+ logger.info("0.1 fullHotValues " + result[0][17])
+ assertEquals(2, fullHotValues.length)
+ assertTrue(fullHotValues[0].trim() == "'1':0.5" || fullHotValues[0].trim()
== "'0':0.5")
+ assertTrue(fullHotValues[1].trim() == "'1':0.5" || fullHotValues[1].trim()
== "'0':0.5")
result = sql """show column cached stats test1(key1)"""
assertEquals(1, result.size())
assertEquals("10000.0", result[0][2])
- assertEquals("null", result[0][17])
+ // Full analyze collects top values for high-NDV key columns.
+ assertTrue(result[0][17].contains(":"))
result = sql """show column cached stats test1(value1)"""
assertEquals(1, result.size())
assertEquals("10000.0", result[0][2])
- assertEquals("null", result[0][17])
+ fullHotValues = result[0][17].split(";")
+ assertEquals(2, fullHotValues.length)
+ assertTrue(fullHotValues[0].trim() == "'1':0.5" || fullHotValues[0].trim()
== "'0':0.5")
+ assertTrue(fullHotValues[1].trim() == "'1':0.5" || fullHotValues[1].trim()
== "'0':0.5")
sql """drop stats test1"""
sql """analyze table test1 with sample rows 400 with sync"""
result = sql """show column stats test1(key1)"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]