This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new a380f5d2228 [enhancement](utf8) import enable_text_validate_utf8 session
var (#45537) (#46070)
a380f5d2228 is described below
commit a380f5d2228517492b863e6998c05093842b599f
Author: daidai <[email protected]>
AuthorDate: Sat Dec 28 10:05:03 2024 +0800
[enhancement](utf8) import enable_text_validate_utf8 session var (#45537)
(#46070)
bp #45537
---
be/src/util/utf8_check.cpp | 7 ++
be/src/util/utf8_check.h | 4 +
be/src/vec/exec/format/csv/csv_reader.cpp | 6 +-
.../scripts/create_preinstalled_scripts/run72.hql | 31 +++++++
.../text/utf8_check/utf8_check_fail.csv | 5 ++
.../doris/datasource/hive/source/HiveScanNode.java | 2 +
.../java/org/apache/doris/qe/SessionVariable.java | 9 ++
.../ExternalFileTableValuedFunction.java | 2 +
gensrc/thrift/PlanNodes.thrift | 2 +
.../external_table_p0/hive/test_utf8_check.out | 55 ++++++++++++
.../external_table_p0/hive/test_utf8_check.groovy | 100 +++++++++++++++++++++
11 files changed, 220 insertions(+), 3 deletions(-)
diff --git a/be/src/util/utf8_check.cpp b/be/src/util/utf8_check.cpp
index 5355b901420..f90c27e5e91 100644
--- a/be/src/util/utf8_check.cpp
+++ b/be/src/util/utf8_check.cpp
@@ -327,4 +327,11 @@ bool validate_utf8(const char* src, size_t len) {
return validate_utf8_naive(src, len);
}
#endif
+
+bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t
len) {
+ if (params.__isset.file_attributes &&
!params.file_attributes.enable_text_validate_utf8) {
+ return true;
+ }
+ return validate_utf8(src, len);
+}
} // namespace doris
diff --git a/be/src/util/utf8_check.h b/be/src/util/utf8_check.h
index 4214e186b71..7e9b7a2a9de 100644
--- a/be/src/util/utf8_check.h
+++ b/be/src/util/utf8_check.h
@@ -17,6 +17,8 @@
#pragma once
+#include <gen_cpp/PlanNodes_types.h>
+
#include <cstddef>
namespace doris {
@@ -25,4 +27,6 @@ namespace doris {
bool validate_utf8(const char* src, size_t len);
// check utf8 use naive c++
bool validate_utf8_naive(const char* data, size_t len);
+
+bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t
len);
} // namespace doris
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 1fc3bbad294..77a5b65d512 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -715,7 +715,7 @@ Status CsvReader::_fill_empty_line(Block* block,
std::vector<MutableColumnPtr>&
}
Status CsvReader::_validate_line(const Slice& line, bool* success) {
- if (!_is_proto_format && !validate_utf8(line.data, line.size)) {
+ if (!_is_proto_format && !validate_utf8(_params, line.data, line.size)) {
if (!_is_load) {
return Status::InternalError<false>("Only support csv data in utf8
codec");
} else {
@@ -951,7 +951,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) {
return Status::InternalError<false>(
"The first line is empty, can not parse column numbers");
}
- if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)),
size)) {
+ if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const
char*>(ptr)), size)) {
return Status::InternalError<false>("Only support csv data in utf8
codec");
}
ptr = _remove_bom(ptr, size);
@@ -968,7 +968,7 @@ Status
CsvReader::_parse_col_names(std::vector<std::string>* col_names) {
if (size == 0) {
return Status::InternalError<false>("The first line is empty, can not
parse column names");
}
- if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)),
size)) {
+ if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const
char*>(ptr)), size)) {
return Status::InternalError<false>("Only support csv data in utf8
codec");
}
ptr = _remove_bom(ptr, size);
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
new file mode 100644
index 00000000000..1ab754b5042
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
@@ -0,0 +1,31 @@
+CREATE TABLE invalid_utf8_data (
+ id INT,
+ corrupted_data STRING,
+ string_data1 STRING,
+ string_data2 STRING
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ','
+LINES TERMINATED BY '\n'
+location '/user/doris/preinstalled_data/text/utf8_check';
+
+
+CREATE TABLE invalid_utf8_data2 (
+ id INT,
+ corrupted_data STRING,
+ string_data1 STRING,
+ string_data2 STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES (
+ "separatorChar" = ",",
+ "quoteChar" = "\"",
+ "escapeChar" = "\\"
+)
+location '/user/doris/preinstalled_data/text/utf8_check';
+
+
+
+msck repair table invalid_utf8_data;
+msck repair table invalid_utf8_data2;
+
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
new file mode 100644
index 00000000000..391cd493660
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
@@ -0,0 +1,5 @@
+1,�,AAB,helloworld
+2,��,AAB,helloworld
+2,���,AAB,helloworld
+4,����,AAB,helloworld
+5,�����,AAB,helloworld
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index 02906494b03..dcabd11358c 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -452,6 +452,8 @@ public class HiveScanNode extends FileQueryScanNode {
TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);
fileAttributes.setHeaderType("");
+ fileAttributes.setEnableTextValidateUtf8(
+
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
if (textParams.isSet(TFileTextScanRangeParams._Fields.ENCLOSE)) {
fileAttributes.setTrimDoubleQuotes(true);
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index e9dd9ec5822..f996b538257 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -665,6 +665,8 @@ public class SessionVariable implements Serializable,
Writable {
*/
public static final String ENABLE_AUTO_CREATE_WHEN_OVERWRITE =
"enable_auto_create_when_overwrite";
+ public static final String ENABLE_TEXT_VALIDATE_UTF8 =
"enable_text_validate_utf8";
+
/**
* If set false, user couldn't submit analyze SQL and FE won't allocate
any related resources.
*/
@@ -2219,6 +2221,13 @@ public class SessionVariable implements Serializable,
Writable {
})
public boolean enableAutoCreateWhenOverwrite = false;
+ @VariableMgr.VarAttr(name = ENABLE_TEXT_VALIDATE_UTF8, needForward = true,
description = {
+ "对于 text 类型的文件读取,是否开启utf8编码检查。非utf8字符会显示成乱码。",
+ "For text type file reading, whether to enable utf8 encoding
check."
+ + "non-utf8 characters will be displayed as garbled
characters."
+ })
+ public boolean enableTextValidateUtf8 = true;
+
@VariableMgr.VarAttr(name = SKIP_CHECKING_ACID_VERSION_FILE, needForward =
true, description = {
"跳过检查 transactional hive 版本文件 '_orc_acid_version.'",
"Skip checking transactional hive version file
'_orc_acid_version.'"
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
index 1f65921832b..cb1a2d89c5d 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
@@ -304,6 +304,8 @@ public abstract class ExternalFileTableValuedFunction
extends TableValuedFunctio
fileAttributes.setHeaderType(this.headerType);
fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes);
fileAttributes.setSkipLines(skipLines);
+ fileAttributes.setEnableTextValidateUtf8(
+
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) {
fileAttributes.setJsonRoot(jsonRoot);
fileAttributes.setJsonpaths(jsonPaths);
diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift
index 1b873787765..7ccb12b3331 100644
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@@ -284,6 +284,8 @@ struct TFileAttributes {
10: optional bool trim_double_quotes;
// csv skip line num, only used when csv header_type is not set.
11: optional i32 skip_lines;
+ //For text type file reading, whether to enable utf8 encoding
check.(Catalog && TVF)
+ 12: optional bool enable_text_validate_utf8 = true;
}
struct TIcebergDeleteFileDesc {
diff --git a/regression-test/data/external_table_p0/hive/test_utf8_check.out
b/regression-test/data/external_table_p0/hive/test_utf8_check.out
new file mode 100644
index 00000000000..7557e789d49
--- /dev/null
+++ b/regression-test/data/external_table_p0/hive/test_utf8_check.out
@@ -0,0 +1,55 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !1 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
+-- !2 --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+c3 text Yes false \N NONE
+c4 text Yes false \N NONE
+
+-- !3 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
+-- !4 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
+-- !1 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
+-- !2 --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+c3 text Yes false \N NONE
+c4 text Yes false \N NONE
+
+-- !3 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
+-- !4 --
+1 � AAB helloworld
+2 �� AAB helloworld
+2 ��� AAB helloworld
+4 ���� AAB helloworld
+5 ����� AAB helloworld
+
diff --git
a/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
new file mode 100644
index 00000000000..aa26fdede73
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_utf8_check","p0,external,tvf,hive,external_docker,external_docker_hive")
{
+ String enabled = context.config.otherConfigs.get("enableHiveTest")
+ if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+ logger.info("diable Hive test.")
+ return;
+ }
+
+ for (String hivePrefix : ["hive2","hive3"]) {
+
+ String hms_port = context.config.otherConfigs.get(hivePrefix +
"HmsPort")
+ String catalog_name = "${hivePrefix}_test_utf8_check"
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+ def hdfsUserName = "doris"
+ String hdfs_port = context.config.otherConfigs.get(hivePrefix +
"HdfsPort")
+ def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+
+ sql """drop catalog if exists ${catalog_name}"""
+ sql """create catalog if not exists ${catalog_name} properties (
+ "type"="hms",
+ 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+ );"""
+ sql """use `${catalog_name}`.`default`"""
+
+
+ sql """ set enable_text_validate_utf8 = true; """
+
+ test {
+ sql """ select * from invalid_utf8_data """
+ exception """Only support csv data in utf8 codec"""
+ }
+
+
+ test {
+ sql """ select * from invalid_utf8_data2; """
+ exception """Only support csv data in utf8 codec"""
+ }
+
+
+ def uri = "${defaultFS}" +
"/user/doris/preinstalled_data/text/utf8_check/utf8_check_fail.csv"
+
+
+ test {
+ sql """ desc function HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "csv",
+ "column_separator"=",")"""
+ exception """Only support csv data in utf8 codec"""
+ }
+
+ test {
+ sql """select * from HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "csv",
+ "column_separator"=",")"""
+ exception """Only support csv data in utf8 codec"""
+ }
+
+
+ sql """ set enable_text_validate_utf8 = false; """
+
+ qt_1 """select * from invalid_utf8_data order by id """
+
+ qt_2 """ desc function HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "csv",
+ "column_separator"=",")"""
+
+
+ qt_3 """select * from HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "csv",
+ "column_separator"=",") order by c1"""
+ qt_4 """select * from invalid_utf8_data2 order by id """
+
+
+ }
+
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]