morningman commented on code in PR #25175:
URL: https://github.com/apache/doris/pull/25175#discussion_r1351812538
##########
fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java:
##########
@@ -142,6 +142,8 @@ default int getBaseColumnIdxByName(String colName) {
Map<String, Set<String>> findReAnalyzeNeededPartitions();
+ List<Long> getChunkSizes();
Review Comment:
Add comment for these new interface
##########
fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java:
##########
@@ -58,14 +58,15 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "${idxId} AS idx_id, "
+ "'${colId}' AS col_id, "
+ "NULL AS part_id, "
- + "${countExpr} AS row_count, "
- + "NDV(`${colName}`) AS ndv, "
- + "${nullCountExpr} AS null_count, "
+ + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+ + "case when NDV(`${colName}`)/count('${colName}') < 0.3 then
NDV(`${colName}`) "
Review Comment:
this factor should be defined some where
##########
fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java:
##########
@@ -396,4 +396,9 @@ public Map<String, Set<String>>
findReAnalyzeNeededPartitions() {
partitions.add("Dummy Partition");
return
getBaseSchema().stream().collect(Collectors.toMap(Column::getName, k ->
partitions));
}
+
+ @Override
+ public List<Long> getChunkSizes() {
+ return null;
Review Comment:
Better not return "null"
##########
fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java:
##########
@@ -277,38 +277,53 @@ private Map<String, String> buildTableStatsParams(String
partId) {
commonParams.put("catalogName", catalog.getName());
commonParams.put("dbName", db.getFullName());
commonParams.put("tblName", tbl.getName());
- commonParams.put("countExpr", getCountExpression());
+ commonParams.put("sampleExpr", getSampleExpression());
+ commonParams.put("scaleFactor", getSampleScaleFactor());
if (col != null) {
commonParams.put("type", col.getType().toString());
}
commonParams.put("lastAnalyzeTimeInMs",
String.valueOf(System.currentTimeMillis()));
return commonParams;
}
- protected String getCountExpression() {
- if (info.samplePercent > 0) {
- return String.format("ROUND(COUNT(1) * 100 / %d)",
info.samplePercent);
- } else {
- return "COUNT(1)";
+ protected String getSampleExpression() {
+ if (tableSample == null) {
+ return "";
}
- }
-
- protected String getNullCountExpression() {
- if (info.samplePercent > 0) {
- return String.format("ROUND(SUM(CASE WHEN `${colName}` IS NULL
THEN 1 ELSE 0 END) * 100 / %d)",
- info.samplePercent);
+ if (tableSample.isPercent()) {
+ return String.format("TABLESAMPLE(%d PERCENT)",
tableSample.getSampleValue());
} else {
- return "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END)";
+ return String.format("TABLESAMPLE(%d ROWS)",
tableSample.getSampleValue());
}
}
- protected String getDataSizeFunction(Column column) {
- String originFunction = super.getDataSizeFunction(column);
- if (info.samplePercent > 0 && !isPartitionOnly) {
- return String.format("ROUND((%s) * 100 / %d)", originFunction,
info.samplePercent);
+ protected String getSampleScaleFactor() {
Review Comment:
Add some comment to explain the logic in code, better give some example
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]