This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new ae1831156f7 [improvement](statistics)Use count as ndv for unique/agg
olap table single key column (#27186) (#27275)
ae1831156f7 is described below
commit ae1831156f7e754ef184f41eecd0b4337361abb6
Author: Jibing-Li <[email protected]>
AuthorDate: Mon Nov 20 17:16:43 2023 +0800
[improvement](statistics)Use count as ndv for unique/agg olap table single
key column (#27186) (#27275)
The single key column of a unique/agg olap table has the same value for count and
ndv. For this kind of column,
there is no need to calculate ndv; simply use count as ndv.
backport #27186
---
.../apache/doris/statistics/BaseAnalysisTask.java | 4 +--
.../apache/doris/statistics/HMSAnalysisTask.java | 1 +
.../apache/doris/statistics/OlapAnalysisTask.java | 40 +++++++++++++++++++---
.../doris/statistics/OlapAnalysisTaskTest.java | 2 +-
4 files changed, 39 insertions(+), 8 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index a278200e5c7..f3fa143b528 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -72,8 +72,8 @@ public abstract class BaseAnalysisTask {
+ "${idxId} AS `idx_id`, "
+ "'${colId}' AS `col_id`, "
+ "NULL AS `part_id`, "
- + "ROUND(COUNT(1) * ${scaleFactor}) AS `row_count`, "
- + "ROUND(NDV(`${colName}`) * ${scaleFactor}) as `ndv`, "
+ + "${rowCount} AS `row_count`, "
+ + "${ndvFunction} as `ndv`, "
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) *
${scaleFactor}) AS `null_count`, "
+ "${min} AS `min`, "
+ "${max} AS `max`, "
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
index 5be026e2acf..7bd540de2c7 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -145,6 +145,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
if (distributionColumns.size() == 1 &&
distributionColumns.contains(col.getName().toLowerCase())) {
bucketFlag = true;
sb.append(LINEAR_ANALYZE_TEMPLATE);
+ params.put("ndvFunction", "ROUND(NDV(`${colName}`) *
${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index d7037580595..97cb10c520c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -17,6 +17,7 @@
package org.apache.doris.statistics;
+import org.apache.doris.catalog.KeysType;
import org.apache.doris.catalog.MaterializedIndex;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.Partition;
@@ -129,21 +130,26 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
}
StringSubstitutor stringSubstitutor = new
StringSubstitutor(params);
String sql;
- // Single distribution column is not fit for DUJ1 estimator, use
linear estimator.
- Set<String> distributionColumns = tbl.getDistributionColumnNames();
- if (distributionColumns.size() == 1 &&
distributionColumns.contains(col.getName().toLowerCase())) {
+ if (useLinearAnalyzeTemplate()) {
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
+ // For single unique key, use count as ndv.
+ if (isSingleUniqueKey()) {
+ params.put("ndvFunction", String.valueOf(rowCount));
+ } else {
+ params.put("ndvFunction", "ROUND(NDV(`${colName}`) *
${scaleFactor})");
+ }
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
} else {
params.put("dataSizeFunction", getDataSizeFunction(col, true));
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
}
LOG.info("Sample for column [{}]. Total rows [{}], rows to sample
[{}], scale factor [{}], "
- + "limited [{}], distribute column [{}], partition column
[{}], key column [{}]",
+ + "limited [{}], distribute column [{}], partition column
[{}], key column [{}], "
+ + "is single unique key [{}]",
col.getName(), params.get("rowCount"), rowsToSample,
params.get("scaleFactor"),
limitFlag, tbl.isDistributionColumn(col.getName()),
- tbl.isPartitionColumn(col.getName()), col.isKey());
+ tbl.isPartitionColumn(col.getName()), col.isKey(),
isSingleUniqueKey());
runQuery(sql, false);
}
}
@@ -278,4 +284,28 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
}
return sampleRows;
}
+
+ /**
+ * Check if the task should use linear analyze template.
+ * @return True for a single unique key column or a single distribution
column.
+ */
+ protected boolean useLinearAnalyzeTemplate() {
+ if (isSingleUniqueKey()) {
+ return true;
+ }
+ Set<String> distributionColumns = tbl.getDistributionColumnNames();
+ return distributionColumns.size() == 1 &&
distributionColumns.contains(col.getName().toLowerCase());
+ }
+
+ /**
+ * Check if the olap table has a single unique key.
+ * @return True if the table has a single unique/agg key. False otherwise.
+ */
+ protected boolean isSingleUniqueKey() {
+ int keysNum = ((OlapTable) tbl).getKeysNum();
+ KeysType keysType = ((OlapTable) tbl).getKeysType();
+ return col.isKey()
+ && keysNum == 1
+ && (keysType.equals(KeysType.UNIQUE_KEYS) ||
keysType.equals(KeysType.AGG_KEYS));
+ }
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index 9437d2d0787..8e30519e8c4 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -218,7 +218,7 @@ public class OlapAnalysisTaskTest {
@Mock
public void runQuery(String sql, boolean needEncode) {
Assertions.assertFalse(needEncode);
- Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-',
'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1
AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, ROUND(COUNT(1) * 5.0) AS
`row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN
`${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS
`min`, 'Mg==' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW()
FROM `catalogName`.`${dbName}`. [...]
+ Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-',
'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1
AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`,
ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS
NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS
`max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM
`catalogName`.`${dbName}`.`${tblName}` limit [...]
return;
}
};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]