This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new c3d81cb3a0d [opt](stats) Use escape rather than base64 for min/max
value #27746 (#27748)
c3d81cb3a0d is described below
commit c3d81cb3a0d84db1b006c7499df4909bcc8277a5
Author: AKIRA <[email protected]>
AuthorDate: Wed Nov 29 20:59:02 2023 +0800
[opt](stats) Use escape rather than base64 for min/max value #27746 (#27748)
---
.../org/apache/doris/statistics/BaseAnalysisTask.java | 6 +++---
.../java/org/apache/doris/statistics/ColStatsData.java | 16 +++-------------
.../org/apache/doris/statistics/ColumnStatistic.java | 4 ----
.../org/apache/doris/statistics/OlapAnalysisTask.java | 4 ++--
.../apache/doris/statistics/StatisticsRepository.java | 4 ++--
.../org/apache/doris/statistics/util/StatisticsUtil.java | 4 +++-
.../apache/doris/statistics/BaseAnalysisTaskTest.java | 4 ++--
.../apache/doris/statistics/OlapAnalysisTaskTest.java | 6 +++---
.../java/org/apache/doris/statistics/StatsMockUtil.java | 2 +-
.../apache/doris/statistics/util/StatisticsUtilTest.java | 8 ++++++++
10 files changed, 27 insertions(+), 31 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 824e3f74abd..4c0e07ce6cb 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -252,7 +252,7 @@ public abstract class BaseAnalysisTask {
protected String getMinFunction() {
if (tableSample == null) {
- return "to_base64(CAST(MIN(`${colName}`) as ${type})) ";
+ return "CAST(MIN(`${colName}`) as ${type}) ";
} else {
// Min value is not accurate while sample, so set it to NULL to
avoid optimizer generate bad plan.
return "NULL";
@@ -276,7 +276,7 @@ public abstract class BaseAnalysisTask {
// Max value is not accurate while sample, so set it to NULL to avoid
optimizer generate bad plan.
protected String getMaxFunction() {
if (tableSample == null) {
- return "to_base64(CAST(MAX(`${colName}`) as ${type})) ";
+ return "CAST(MAX(`${colName}`) as ${type}) ";
} else {
return "NULL";
}
@@ -315,7 +315,7 @@ public abstract class BaseAnalysisTask {
long startTime = System.currentTimeMillis();
try (AutoCloseConnectContext a =
StatisticsUtil.buildConnectContext()) {
stmtExecutor = new StmtExecutor(a.connectContext, sql);
- ColStatsData colStatsData = new
ColStatsData(stmtExecutor.executeInternalQuery().get(0), needEncode);
+ ColStatsData colStatsData = new
ColStatsData(stmtExecutor.executeInternalQuery().get(0));
job.appendBuf(this, Collections.singletonList(colStatsData));
} finally {
LOG.debug("End cost time in secs: " + (System.currentTimeMillis()
- startTime) / 1000);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
index fa7546abd5b..7878a065488 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
@@ -21,8 +21,6 @@ import org.apache.doris.statistics.util.StatisticsUtil;
import com.google.common.annotations.VisibleForTesting;
-import java.nio.charset.StandardCharsets;
-import java.util.Base64;
import java.util.StringJoiner;
/**
@@ -56,8 +54,6 @@ public class ColStatsData {
public final String updateTime;
- public final boolean needEncode;
-
@VisibleForTesting
public ColStatsData() {
statsId = new StatsId();
@@ -68,10 +64,9 @@ public class ColStatsData {
maxLit = null;
dataSizeInBytes = 0;
updateTime = null;
- needEncode = true;
}
- public ColStatsData(ResultRow row, boolean needEncode) {
+ public ColStatsData(ResultRow row) {
this.statsId = new StatsId(row);
this.count = (long) Double.parseDouble(row.get(7));
this.ndv = (long) Double.parseDouble(row.getWithDefault(8, "0"));
@@ -80,7 +75,6 @@ public class ColStatsData {
this.maxLit = row.get(11);
this.dataSizeInBytes = (long)
Double.parseDouble(row.getWithDefault(12, "0"));
this.updateTime = row.get(13);
- this.needEncode = needEncode;
}
public String toSQL(boolean roundByParentheses) {
@@ -93,12 +87,8 @@ public class ColStatsData {
sj.add(String.valueOf(count));
sj.add(String.valueOf(ndv));
sj.add(String.valueOf(nullCount));
- sj.add(minLit == null ? "NULL" : needEncode
- ? "'" +
Base64.getEncoder().encodeToString(minLit.getBytes(StandardCharsets.UTF_8)) +
"'"
- : "'" + minLit + "'");
- sj.add(maxLit == null ? "NULL" : needEncode
- ? "'" +
Base64.getEncoder().encodeToString(maxLit.getBytes(StandardCharsets.UTF_8)) +
"'"
- : "'" + maxLit + "'");
+ sj.add(minLit == null ? "NULL" : "'" +
StatisticsUtil.escapeSQL(minLit) + "'");
+ sj.add(maxLit == null ? "NULL" : "'" +
StatisticsUtil.escapeSQL(maxLit) + "'");
sj.add(String.valueOf(dataSizeInBytes));
sj.add(StatisticsUtil.quote(updateTime));
return sj.toString();
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
index eefaf7badeb..5ea1f9097bb 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
@@ -31,8 +31,6 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONObject;
-import java.nio.charset.StandardCharsets;
-import java.util.Base64;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -175,7 +173,6 @@ public class ColumnStatistic {
String min = row.get(10);
String max = row.get(11);
if (min != null && !min.equalsIgnoreCase("NULL")) {
- min = new String(Base64.getDecoder().decode(min),
StandardCharsets.UTF_8);
// Internal catalog get the min/max value using a separate SQL,
// and the value is already encoded by base64. Need to handle
internal and external catalog separately.
if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID &&
min.equalsIgnoreCase("NULL")) {
@@ -193,7 +190,6 @@ public class ColumnStatistic {
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
}
if (max != null && !max.equalsIgnoreCase("NULL")) {
- max = new String(Base64.getDecoder().decode(max),
StandardCharsets.UTF_8);
if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID &&
max.equalsIgnoreCase("NULL")) {
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
} else {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index 04a763b6884..45ccdbb0d68 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -92,8 +92,8 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
// Get basic stats, including min and max.
ResultRow basicStats = collectBasicStat(r);
long rowCount = tbl.getRowCount();
- String min = StatisticsUtil.encodeValue(basicStats, 0);
- String max = StatisticsUtil.encodeValue(basicStats, 1);
+ String min = StatisticsUtil.escapeSQL(basicStats.get(0));
+ String max = StatisticsUtil.escapeSQL(basicStats.get(1));
boolean limitFlag = false;
long rowsToSample = pair.second;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
index 12ca6b4aa1f..29e11ac75ad 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
@@ -269,8 +269,8 @@ public class StatisticsRepository {
params.put("count", String.valueOf(columnStatistic.count));
params.put("ndv", String.valueOf(columnStatistic.ndv));
params.put("nullCount", String.valueOf(columnStatistic.numNulls));
- params.put("min", StatisticsUtil.encodeString(min));
- params.put("max", StatisticsUtil.encodeString(max));
+ params.put("min", StatisticsUtil.escapeSQL(min));
+ params.put("max", StatisticsUtil.escapeSQL(max));
params.put("dataSize", String.valueOf(columnStatistic.dataSize));
if (partitionIds.isEmpty()) {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
index 66b13ff87f1..3f9abcad5f9 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
@@ -786,7 +786,9 @@ public class StatisticsUtil {
if (str == null) {
return null;
}
- return org.apache.commons.lang3.StringUtils.replace(str, "'", "''");
+ return str.replace("'", "''")
+ .replace("\\", "\\\\")
+ .replace("\"", "\"\"");
}
public static boolean isExternalTable(String catalogName, String dbName,
String tblName) {
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
index fe81c055e0d..187c4d207df 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
@@ -42,14 +42,14 @@ public class BaseAnalysisTaskTest {
Assertions.assertEquals("SUM(t1.count) * 4", dataSizeFunction);
String minFunction = olapAnalysisTask.getMinFunction();
- Assertions.assertEquals("to_base64(CAST(MIN(`${colName}`) as ${type}))
", minFunction);
+ Assertions.assertEquals("CAST(MIN(`${colName}`) as ${type}) ",
minFunction);
olapAnalysisTask.tableSample = new TableSample(true, 20L);
minFunction = olapAnalysisTask.getMinFunction();
Assertions.assertEquals("NULL", minFunction);
olapAnalysisTask.tableSample = null;
String maxFunction = olapAnalysisTask.getMaxFunction();
- Assertions.assertEquals("to_base64(CAST(MAX(`${colName}`) as ${type}))
", maxFunction);
+ Assertions.assertEquals("CAST(MAX(`${colName}`) as ${type}) ",
maxFunction);
olapAnalysisTask.tableSample = new TableSample(true, 20L);
maxFunction = olapAnalysisTask.getMaxFunction();
Assertions.assertEquals("NULL", maxFunction);
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index c174795b36b..7b7894e54b4 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -151,7 +151,7 @@ public class OlapAnalysisTaskTest {
@Mock
public void runQuery(String sql, boolean needEncode) {
Assertions.assertFalse(needEncode);
- Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1',
'-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS
`row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) -
SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) *
SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL,
`t1`.`count`, 0)), 0) * 5.0 as `null_count`, 'MQ==' AS `min`, 'M [...]
+ Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1',
'-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS
`row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) -
SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) *
SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL,
`t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' A [...]
return;
}
};
@@ -218,7 +218,7 @@ public class OlapAnalysisTaskTest {
@Mock
public void runQuery(String sql, boolean needEncode) {
Assertions.assertFalse(needEncode);
- Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-',
'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1
AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`,
ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS
NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS
`max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM
`catalogName`.`${dbName}`.`${tblName}` limit [...]
+ Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-',
'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1
AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`,
ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS
NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, '1' AS `min`, '2' AS `max`,
SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM
`catalogName`.`${dbName}`.`${tblName}` limit 100", sql);
return;
}
};
@@ -292,7 +292,7 @@ public class OlapAnalysisTaskTest {
@Mock
public void runQuery(String sql, boolean needEncode) {
Assertions.assertFalse(needEncode);
- Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1',
'-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS
`row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) -
SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) *
SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL,
`t1`.`count`, 0)), 0) * 5.0 as `null_count`, 'MQ==' AS `min`, 'M [...]
+ Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1',
'-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS
`row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) -
SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) *
SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL,
`t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' A [...]
return;
}
};
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java
index 84e1112d216..45c78f665b0 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java
@@ -40,7 +40,7 @@ public class StatsMockUtil {
add("0");
add("10");
// 11
- add("MTE=");
+ add("11");
add("12");
add(String.valueOf(System.currentTimeMillis()));
}};
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java
index 2c0854dcf21..c827a7d1690 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java
@@ -142,4 +142,12 @@ public class StatisticsUtilTest {
.encodeToString("a".getBytes(StandardCharsets.UTF_8)),
StatisticsUtil.encodeValue(row, 1));
Assertions.assertEquals("NULL", StatisticsUtil.encodeValue(row, 2));
}
+
+ @Test
+ public void testEscape() {
+ // \'"
+ String origin = "\\'\"";
+ // \\''""
+ Assertions.assertEquals("\\\\''\"\"",
StatisticsUtil.escapeSQL(origin));
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]