This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new fe42ec9c104 [fix](tvf) Tvf supports to parse the enclose character in
csv files (#45407)
fe42ec9c104 is described below
commit fe42ec9c104aee83292714d2a4a55eaed5841bc0
Author: Tiewei Fang <[email protected]>
AuthorDate: Wed Dec 18 09:55:42 2024 +0800
[fix](tvf) Tvf supports to parse the enclose character in csv files (#45407)
### What problem does this PR solve?
Problem Summary:
The TVF (table-valued function) now supports parsing the enclose character in CSV files.
---
be/src/vec/exec/format/csv/csv_reader.cpp | 6 +-
.../doris/common/util/FileFormatConstants.java | 1 +
.../ExternalFileTableValuedFunction.java | 15 +++++
.../data/external_table_p0/tvf/enclose_csv.csv | 6 ++
.../tvf/test_local_tvf_enclose.out | 15 +++++
.../tvf/test_local_tvf_enclose.groovy | 72 ++++++++++++++++++++++
6 files changed, 114 insertions(+), 1 deletion(-)
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index bf0e543d650..b27bb050dc6 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -928,9 +928,13 @@ Status CsvReader::_prepare_parse(size_t* read_line, bool*
is_parse_name) {
_trim_tailing_spaces, _trim_double_quotes, _value_separator,
_value_separator_length);
} else {
+ // If we pass `_file_slot_descs.size() - 1` to
EncloseCsvLineReaderContext, it will cause a BE core dump
+ // because _file_slot_descs is currently an empty vector.
+ // The _file_slot_descs.size() is only used to reserve space,
+ // so it's ok to pass 0 to EncloseCsvLineReaderContext.
text_line_reader_ctx = std::make_shared<EncloseCsvLineReaderContext>(
_line_delimiter, _line_delimiter_length, _value_separator,
_value_separator_length,
- _file_slot_descs.size() - 1, _enclose, _escape, _keep_cr);
+ 0, _enclose, _escape, _keep_cr);
_fields_splitter = std::make_unique<EncloseCsvTextFieldSplitter>(
_trim_tailing_spaces, false,
std::static_pointer_cast<EncloseCsvLineReaderContext>(text_line_reader_ctx),
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java
b/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java
index bdb2e97b9f2..7050bec9d77 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java
@@ -50,6 +50,7 @@ public class FileFormatConstants {
public static final String PROP_COMPRESS = "compress";
public static final String PROP_COMPRESS_TYPE = "compress_type";
public static final String PROP_PATH_PARTITION_KEYS =
"path_partition_keys";
+ public static final String PROP_ENCLOSE = "enclose";
// decimal(p,s)
public static final Pattern DECIMAL_TYPE_PATTERN =
Pattern.compile("decimal\\((\\d+),(\\d+)\\)");
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
index 6f45a1cc0eb..55d046c2ed9 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
@@ -120,6 +120,7 @@ public abstract class ExternalFileTableValuedFunction
extends TableValuedFunctio
private TTextSerdeType textSerdeType = TTextSerdeType.JSON_TEXT_SERDE;
private String columnSeparator =
FileFormatConstants.DEFAULT_COLUMN_SEPARATOR;
private String lineDelimiter = FileFormatConstants.DEFAULT_LINE_DELIMITER;
+ private byte enclose = 0;
private String jsonRoot = "";
private String jsonPaths = "";
private boolean stripOuterArray;
@@ -231,6 +232,17 @@ public abstract class ExternalFileTableValuedFunction
extends TableValuedFunctio
}
lineDelimiter = Separator.convertSeparator(lineDelimiter);
+ String enclosedString = getOrDefaultAndRemove(copiedProps,
FileFormatConstants.PROP_ENCLOSE, "");
+ if (!Strings.isNullOrEmpty(enclosedString)) {
+ if (enclosedString.length() > 1) {
+ throw new AnalysisException("enclose should not be longer than
one byte.");
+ }
+ enclose = (byte) enclosedString.charAt(0);
+ if (enclose == 0) {
+ throw new AnalysisException("enclose should not be byte [0].");
+ }
+ }
+
jsonRoot = getOrDefaultAndRemove(copiedProps,
FileFormatConstants.PROP_JSON_ROOT, "");
jsonPaths = getOrDefaultAndRemove(copiedProps,
FileFormatConstants.PROP_JSON_PATHS, "");
readJsonByLine = Boolean.valueOf(
@@ -285,6 +297,9 @@ public abstract class ExternalFileTableValuedFunction
extends TableValuedFunctio
TFileTextScanRangeParams fileTextScanRangeParams = new
TFileTextScanRangeParams();
fileTextScanRangeParams.setColumnSeparator(this.columnSeparator);
fileTextScanRangeParams.setLineDelimiter(this.lineDelimiter);
+ if (enclose != 0) {
+ fileTextScanRangeParams.setEnclose(enclose);
+ }
fileAttributes.setTextParams(fileTextScanRangeParams);
if (this.fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN) {
fileAttributes.setHeaderType(this.headerType);
diff --git a/regression-test/data/external_table_p0/tvf/enclose_csv.csv
b/regression-test/data/external_table_p0/tvf/enclose_csv.csv
new file mode 100644
index 00000000000..ca787200ff1
--- /dev/null
+++ b/regression-test/data/external_table_p0/tvf/enclose_csv.csv
@@ -0,0 +1,6 @@
+id, field1, field2
+"1", "hello", "same, field"
+"2", "doris", "same, field2"
+"3", "nereids", "same, field3"
+"4", "pipeline", "same, field4"
+"5", "storage", "same, field5"
\ No newline at end of file
diff --git
a/regression-test/data/external_table_p0/tvf/test_local_tvf_enclose.out
b/regression-test/data/external_table_p0/tvf/test_local_tvf_enclose.out
new file mode 100644
index 00000000000..6e5d10e4858
--- /dev/null
+++ b/regression-test/data/external_table_p0/tvf/test_local_tvf_enclose.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !enclose_1 --
+"1" "hello" "same, field"
+"2" "doris" "same, field2"
+"3" "nereids" "same, field3"
+"4" "pipeline" "same, field4"
+"5" "storage" "same, field5"
+
+-- !enclose_2 --
+1 hello same, field
+2 doris same, field2
+3 nereids same, field3
+4 pipeline same, field4
+5 storage same, field5
+
diff --git
a/regression-test/suites/external_table_p0/tvf/test_local_tvf_enclose.groovy
b/regression-test/suites/external_table_p0/tvf/test_local_tvf_enclose.groovy
new file mode 100644
index 00000000000..e7437cd20ec
--- /dev/null
+++ b/regression-test/suites/external_table_p0/tvf/test_local_tvf_enclose.groovy
@@ -0,0 +1,72 @@
+import org.junit.Assert
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_local_tvf_enclose", "p0,tvf") {
+ List<List<Object>> backends = sql """ show backends """
+ assertTrue(backends.size() > 0)
+ def be_id = backends[0][0]
+
+ String filename = "enclose_csv.csv"
+
+ def dataFilePath = context.config.dataPath +
"/external_table_p0/tvf/${filename}"
+
+ def outFilePath="/"
+
+ for (List<Object> backend : backends) {
+ def be_host = backend[1]
+ scpFiles ("root", be_host, dataFilePath, outFilePath, false);
+ }
+
+
+ sql """set enable_nereids_planner=true"""
+ sql """set enable_fallback_to_original_planner=false"""
+
+ qt_enclose_1 """
+ select * from local(
+ "file_path" = "${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv_with_names",
+ "column_separator" = ", ",
+ "enclose" = "\\\"") order by id;
+ """
+
+ qt_enclose_2 """
+ select * from local(
+ "file_path" = "${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv_with_names",
+ "column_separator" = ", ",
+ "enclose" = "\\\"",
+ "trim_double_quotes" = "true") order by id;
+ """
+
+ // test error case
+ test {
+ sql """
+ select * from local(
+ "file_path" = "${filename}",
+ "backend_id" = "${be_id}",
+ "format" = "csv_with_names",
+ "column_separator" = ", ",
+ "enclose" = "\\\"\\\"") order by id;
+ """
+ // check exception message contains
+ exception "enclose should not be longer than one byte."
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]