This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new bbef3ec6903 [enhancement](utf8)import enable_text_validate_utf8 session
var (#45537) (#46071)
bbef3ec6903 is described below
commit bbef3ec6903f33fdef989c25b2486f186d4f0059
Author: daidai <[email protected]>
AuthorDate: Fri Dec 27 16:56:34 2024 +0800
[enhancement](utf8)import enable_text_validate_utf8 session var (#45537)
(#46071)
bp #46071
---
be/src/util/utf8_check.cpp | 7 ++
be/src/util/utf8_check.h | 4 +
be/src/vec/exec/format/csv/csv_reader.cpp | 6 +-
.../scripts/create_preinstalled_scripts/run72.hql | 31 +++++++
.../text/utf8_check/utf8_check_fail.csv | 5 ++
.../doris/datasource/hive/source/HiveScanNode.java | 4 +
.../java/org/apache/doris/qe/SessionVariable.java | 10 +++
.../ExternalFileTableValuedFunction.java | 2 +
gensrc/thrift/PlanNodes.thrift | 2 +
.../external_table_p0/hive/test_utf8_check.out | 55 ++++++++++++
.../external_table_p0/hive/test_utf8_check.groovy | 100 +++++++++++++++++++++
11 files changed, 223 insertions(+), 3 deletions(-)
diff --git a/be/src/util/utf8_check.cpp b/be/src/util/utf8_check.cpp
index 5355b901420..f90c27e5e91 100644
--- a/be/src/util/utf8_check.cpp
+++ b/be/src/util/utf8_check.cpp
@@ -327,4 +327,11 @@ bool validate_utf8(const char* src, size_t len) {
return validate_utf8_naive(src, len);
}
#endif
+
+bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t
len) {
+ if (params.__isset.file_attributes &&
!params.file_attributes.enable_text_validate_utf8) {
+ return true;
+ }
+ return validate_utf8(src, len);
+}
} // namespace doris
diff --git a/be/src/util/utf8_check.h b/be/src/util/utf8_check.h
index 4214e186b71..7e9b7a2a9de 100644
--- a/be/src/util/utf8_check.h
+++ b/be/src/util/utf8_check.h
@@ -17,6 +17,8 @@
#pragma once
+#include <gen_cpp/PlanNodes_types.h>
+
#include <cstddef>
namespace doris {
@@ -25,4 +27,6 @@ namespace doris {
bool validate_utf8(const char* src, size_t len);
// check utf8 use naive c++
bool validate_utf8_naive(const char* data, size_t len);
+
+bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t
len);
} // namespace doris
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index b27bb050dc6..397095590dd 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -713,7 +713,7 @@ Status CsvReader::_fill_empty_line(Block* block,
std::vector<MutableColumnPtr>&
}
Status CsvReader::_validate_line(const Slice& line, bool* success) {
- if (!_is_proto_format && !validate_utf8(line.data, line.size)) {
+ if (!_is_proto_format && !validate_utf8(_params, line.data, line.size)) {
if (!_is_load) {
return Status::InternalError<false>("Only support csv data in utf8
codec");
} else {
@@ -954,7 +954,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) {
return Status::InternalError<false>(
"The first line is empty, can not parse column numbers");
}
- if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)),
size)) {
+ if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const
char*>(ptr)), size)) {
return Status::InternalError<false>("Only support csv data in utf8
codec");
}
ptr = _remove_bom(ptr, size);
@@ -971,7 +971,7 @@ Status
CsvReader::_parse_col_names(std::vector<std::string>* col_names) {
if (size == 0) {
return Status::InternalError<false>("The first line is empty, can not
parse column names");
}
- if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)),
size)) {
+ if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const
char*>(ptr)), size)) {
return Status::InternalError<false>("Only support csv data in utf8
codec");
}
ptr = _remove_bom(ptr, size);
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
new file mode 100644
index 00000000000..1ab754b5042
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
@@ -0,0 +1,31 @@
+CREATE TABLE invalid_utf8_data (
+ id INT,
+ corrupted_data STRING,
+ string_data1 STRING,
+ string_data2 STRING
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ','
+LINES TERMINATED BY '\n'
+location '/user/doris/preinstalled_data/text/utf8_check';
+
+
+CREATE TABLE invalid_utf8_data2 (
+ id INT,
+ corrupted_data STRING,
+ string_data1 STRING,
+ string_data2 STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES (
+ "separatorChar" = ",",
+ "quoteChar" = "\"",
+ "escapeChar" = "\\"
+)
+location '/user/doris/preinstalled_data/text/utf8_check';
+
+
+
+msck repair table invalid_utf8_data;
+msck repair table invalid_utf8_data2;
+
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
new file mode 100644
index 00000000000..391cd493660
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
@@ -0,0 +1,5 @@
+1,�,AAB,helloworld
+2,��,AAB,helloworld
+2,���,AAB,helloworld
+4,����,AAB,helloworld
+5,�����,AAB,helloworld
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index 35b21c368ea..3a8ab722fb6 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -435,6 +435,8 @@ public class HiveScanNode extends FileQueryScanNode {
textParams.setNullFormat(HiveProperties.getNullFormat(table));
fileAttributes.setTextParams(textParams);
fileAttributes.setHeaderType("");
+ fileAttributes.setEnableTextValidateUtf8(
+
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else if
(serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
TFileTextScanRangeParams textParams = new
TFileTextScanRangeParams();
// set set properties of OpenCSVSerde
@@ -451,6 +453,8 @@ public class HiveScanNode extends FileQueryScanNode {
if (textParams.isSetEnclose()) {
fileAttributes.setTrimDoubleQuotes(true);
}
+ fileAttributes.setEnableTextValidateUtf8(
+
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else if (serDeLib.equals("org.apache.hive.hcatalog.data.JsonSerDe"))
{
TFileTextScanRangeParams textParams = new
TFileTextScanRangeParams();
textParams.setColumnSeparator("\t");
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 53b8423e0fe..ab565defe65 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -690,11 +690,14 @@ public class SessionVariable implements Serializable,
Writable {
*/
public static final String ENABLE_AUTO_CREATE_WHEN_OVERWRITE =
"enable_auto_create_when_overwrite";
+
public static final String
ENABLE_ADAPTIVE_PIPELINE_TASK_SERIAL_READ_ON_LIMIT =
"enable_adaptive_pipeline_task_serial_read_on_limit";
public static final String ADAPTIVE_PIPELINE_TASK_SERIAL_READ_ON_LIMIT =
"adaptive_pipeline_task_serial_read_on_limit";
+ public static final String ENABLE_TEXT_VALIDATE_UTF8 =
"enable_text_validate_utf8";
+
/**
* If set false, user couldn't submit analyze SQL and FE won't allocate
any related resources.
*/
@@ -2298,6 +2301,13 @@ public class SessionVariable implements Serializable,
Writable {
})
public boolean enableAutoCreateWhenOverwrite = false;
+ @VariableMgr.VarAttr(name = ENABLE_TEXT_VALIDATE_UTF8, needForward = true,
description = {
+ "对于 text 类型的文件读取,是否开启utf8编码检查。非utf8字符会显示成乱码。",
+ "For text type file reading, whether to enable utf8 encoding
check."
+ + "non-utf8 characters will be displayed as garbled
characters."
+ })
+ public boolean enableTextValidateUtf8 = true;
+
@VariableMgr.VarAttr(name = SKIP_CHECKING_ACID_VERSION_FILE, needForward =
true, description = {
"跳过检查 transactional hive 版本文件 '_orc_acid_version.'",
"Skip checking transactional hive version file
'_orc_acid_version.'"
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
index 55d046c2ed9..9031efd0dc2 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
@@ -305,6 +305,8 @@ public abstract class ExternalFileTableValuedFunction
extends TableValuedFunctio
fileAttributes.setHeaderType(this.headerType);
fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes);
fileAttributes.setSkipLines(skipLines);
+ fileAttributes.setEnableTextValidateUtf8(
+
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) {
fileAttributes.setJsonRoot(jsonRoot);
fileAttributes.setJsonpaths(jsonPaths);
diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift
index 62e88621aeb..9aaa7076901 100644
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@@ -284,6 +284,8 @@ struct TFileAttributes {
10: optional bool trim_double_quotes;
// csv skip line num, only used when csv header_type is not set.
11: optional i32 skip_lines;
+ //For text type file reading, whether to enable utf8 encoding
check.(Catalog && TVF)
+ 12: optional bool enable_text_validate_utf8 = true;
// for cloud copy into
1001: optional bool ignore_csv_redundant_col;
}
diff --git a/regression-test/data/external_table_p0/hive/test_utf8_check.out
b/regression-test/data/external_table_p0/hive/test_utf8_check.out
new file mode 100644
index 00000000000..7557e789d49
--- /dev/null
+++ b/regression-test/data/external_table_p0/hive/test_utf8_check.out
@@ -0,0 +1,55 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !1 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
+-- !2 --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+c3 text Yes false \N NONE
+c4 text Yes false \N NONE
+
+-- !3 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
+-- !4 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
+-- !1 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
+-- !2 --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+c3 text Yes false \N NONE
+c4 text Yes false \N NONE
+
+-- !3 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
+-- !4 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
diff --git
a/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
new file mode 100644
index 00000000000..aa26fdede73
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_utf8_check","p0,external,tvf,hive,external_docker,external_docker_hive")
{
+ String enabled = context.config.otherConfigs.get("enableHiveTest")
+ if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+ logger.info("diable Hive test.")
+ return;
+ }
+
+ for (String hivePrefix : ["hive2","hive3"]) {
+
+ String hms_port = context.config.otherConfigs.get(hivePrefix +
"HmsPort")
+ String catalog_name = "${hivePrefix}_test_utf8_check"
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+ def hdfsUserName = "doris"
+ String hdfs_port = context.config.otherConfigs.get(hivePrefix +
"HdfsPort")
+ def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+
+ sql """drop catalog if exists ${catalog_name}"""
+ sql """create catalog if not exists ${catalog_name} properties (
+ "type"="hms",
+ 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+ );"""
+ sql """use `${catalog_name}`.`default`"""
+
+
+ sql """ set enable_text_validate_utf8 = true; """
+
+ test {
+ sql """ select * from invalid_utf8_data """
+ exception """Only support csv data in utf8 codec"""
+ }
+
+
+ test {
+ sql """ select * from invalid_utf8_data2; """
+ exception """Only support csv data in utf8 codec"""
+ }
+
+
+ def uri = "${defaultFS}" +
"/user/doris/preinstalled_data/text/utf8_check/utf8_check_fail.csv"
+
+
+ test {
+ sql """ desc function HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "csv",
+ "column_separator"=",")"""
+ exception """Only support csv data in utf8 codec"""
+ }
+
+ test {
+ sql """select * from HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "csv",
+ "column_separator"=",")"""
+ exception """Only support csv data in utf8 codec"""
+ }
+
+
+ sql """ set enable_text_validate_utf8 = false; """
+
+ qt_1 """select * from invalid_utf8_data order by id """
+
+ qt_2 """ desc function HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "csv",
+ "column_separator"=",")"""
+
+
+ qt_3 """select * from HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "csv",
+ "column_separator"=",") order by c1"""
+ qt_4 """select * from invalid_utf8_data2 order by id """
+
+
+ }
+
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]