This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 4e8148105aa [fix](serde) Fix a BE core dump that serde may cause
when reading a schema-changed text table. (#50105) (#50504)
4e8148105aa is described below
commit 4e8148105aad3c812ebdd64620eb6535df8ce953
Author: daidai <[email protected]>
AuthorDate: Tue Apr 29 12:54:43 2025 +0800
[fix](serde) Fix a BE core dump that serde may cause when reading
a schema-changed text table. (#50105) (#50504)
Backport of #50105.
---
.../data_types/serde/data_type_struct_serde.cpp | 6 +
be/src/vec/exec/format/csv/csv_reader.cpp | 6 +-
.../data_types/serde/data_type_serde_csv_test.cpp | 232 +++++++++++++++++++++
3 files changed, 241 insertions(+), 3 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
index d48f42e2227..02e8fb17bf2 100644
--- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
@@ -257,6 +257,12 @@ Status
DataTypeStructSerDe::deserialize_one_cell_from_hive_text(
}
}
auto& struct_column = static_cast<ColumnStruct&>(column);
+
+ for (auto i = slices.size(); i < struct_column.get_columns().size(); ++i) {
+ // Hive schema change will cause the number of sub-columns in the file to
+ // be inconsistent with the number of sub-columns of the column in the table.
+ slices.emplace_back(options.null_format, options.null_len);
+ }
for (size_t loc = 0; loc < struct_column.get_columns().size(); loc++) {
Status st = elem_serdes_ptrs[loc]->deserialize_one_cell_from_hive_text(
struct_column.get_column(loc), slices[loc], options,
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 660e25b2b72..5e37e4834dc 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -67,8 +67,6 @@ enum class FileCachePolicy : uint8_t;
namespace doris::vectorized {
-const static Slice _s_null_slice = Slice("\\N");
-
void EncloseCsvTextFieldSplitter::do_split(const Slice& line,
std::vector<Slice>* splitted_values) {
const char* data = line.data;
const auto& column_sep_positions =
_text_line_reader_ctx->column_sep_positions();
@@ -656,7 +654,9 @@ Status CsvReader::_fill_dest_columns(const Slice& line,
Block* block,
int col_idx = _col_idxs[i];
// col idx is out of range, fill with null.
const Slice& value =
- col_idx < _split_values.size() ? _split_values[col_idx] :
_s_null_slice;
+ col_idx < _split_values.size()
+ ? _split_values[col_idx]
+ : Slice {_options.null_format,
static_cast<size_t>(_options.null_len)};
Slice slice {value.data, value.size};
IColumn* col_ptr = columns[i];
diff --git a/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
b/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
index 936d495cc92..b3e49fdcf8c 100644
--- a/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
@@ -19,6 +19,9 @@
#include "olap/types.h" // for TypeInfo
#include "olap/wrapper_field.h"
#include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
#include "vec/common/string_buffer.hpp"
#include "vec/core/field.h"
#include "vec/data_types/data_type.h"
@@ -482,4 +485,233 @@ TEST(CsvSerde, ComplexTypeSerdeCsvTest) {
EXPECT_EQ(str, rand_s_d.to_string());
}
}
+
+TEST(CsvSerde, ComplexTypeSerdeSchemaChangedCsvTest) {
+ { //struct<string, string> => struct<string, string, string>
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+
+ string str = "false\002example";
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+ DataTypePtr data_type_ptr =
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+ auto col = data_type_ptr->create_column();
+ Slice slice(str.data(), str.size());
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions);
+ EXPECT_EQ(st, Status::OK());
+ auto struct_col = static_cast<ColumnStruct&>(
+ static_cast<ColumnNullable&>(*col.get()).get_nested_column());
+ EXPECT_EQ(struct_col.get_column(0).get_data_at(0).to_string(),
"false");
+ EXPECT_EQ(struct_col.get_column(1).get_data_at(0).to_string(),
"example");
+
+ EXPECT_EQ(struct_col.get_column(0).is_null_at(0), false);
+ EXPECT_EQ(struct_col.get_column(1).is_null_at(0), false);
+ EXPECT_EQ(struct_col.get_column(2).is_null_at(0), true);
+ }
+
+ { // Map<int,String> => array<string>
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+
+ string str = "1\003example\0022\003test";
+
+ DataTypePtr data_type_ptr = make_nullable(
+
std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>())));
+
+ auto col = data_type_ptr->create_column();
+ Slice slice(str.data(), str.size());
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions);
+ EXPECT_EQ(st, Status::OK());
+ auto array_col = static_cast<ColumnArray&>(
+ static_cast<ColumnNullable&>(*col.get()).get_nested_column());
+
+ auto string_col = static_cast<ColumnString&>(
+
static_cast<ColumnNullable&>(array_col.get_data()).get_nested_column());
+ EXPECT_EQ(string_col.get_data_at(0).to_string(), "1\003example");
+ EXPECT_EQ(string_col.get_data_at(1).to_string(), "2\003test");
+ }
+
+ { // null
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+ std::string null_format = "null";
+ formatOptions.escape_char = '|';
+ formatOptions.null_format = null_format.data();
+ formatOptions.null_len = null_format.size();
+
+ static const string str = "null";
+
+ DataTypePtr data_type_ptr = make_nullable(
+
std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>())));
+
+ auto col = data_type_ptr->create_column();
+ Slice slice(str.data(), str.size());
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions);
+ EXPECT_EQ(st, Status::OK());
+ EXPECT_EQ(col->is_null_at(0), 1);
+ }
+
+ { // \\N
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+ std::string null_format = "null";
+ formatOptions.escape_char = '|';
+ formatOptions.null_format = null_format.data();
+ formatOptions.null_len = null_format.size();
+
+ static const string str = "\\N";
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+ DataTypePtr data_type_ptr =
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+ auto col = data_type_ptr->create_column();
+ Slice slice(str.data(), str.size());
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions);
+ EXPECT_EQ(st, Status::OK());
+ EXPECT_EQ(col->is_null_at(0), 0);
+ }
+
+ { // \\N
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+ formatOptions.escape_char = '|';
+
+ static const string str = "\\N";
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+ DataTypePtr data_type_ptr =
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+ auto col = data_type_ptr->create_column();
+ Slice slice(str.data(), str.size());
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions);
+ EXPECT_EQ(st, Status::OK());
+ EXPECT_EQ(col->is_null_at(0), 1);
+ }
+
+ { // random
+ auto randomControlChar = [&]() { return static_cast<char>(rand() % 7 +
2); };
+
+ auto randomPrintableChar = []() { return static_cast<char>(rand() %
(126 - 32 + 1) + 32); };
+
+ auto generateMixedString = [&](int n) -> std::string {
+ std::string result;
+ for (int i = 0; i < n; ++i) {
+ if (rand() % 4 == 0) {
+ result += randomControlChar();
+ } else {
+ result += randomPrintableChar();
+ }
+ }
+ for (unsigned char c : result) {
+ printf("\\x%02X ", c);
+ }
+ std::cout << std::endl;
+
+ return result;
+ };
+
+ std::srand(std::time(nullptr));
+
+ for (int i = 0; i < 100; i++) {
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+ string str = generateMixedString(rand() % 100 + 10);
+
+#define TEST_REPLACE
\
+ auto col = data_type_ptr->create_column();
\
+ Slice slice(str.data(), str.size());
\
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
\
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions); \
+ EXPECT_EQ(st, Status::OK());
+
+ {
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+ DataTypePtr data_type_ptr =
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+ TEST_REPLACE
+ }
+
+ {
+ DataTypePtr data_type_ptr = std::make_shared<DataTypeMap>(
+ make_nullable(std::make_shared<DataTypeInt32>()),
+ make_nullable(std::make_shared<DataTypeMap>(
+
make_nullable(std::make_shared<DataTypeString>()),
+
make_nullable(std::make_shared<DataTypeInt32>()))));
+
+ TEST_REPLACE
+ }
+
+ {
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+ DataTypePtr data_type_ptr =
make_nullable(std::make_shared<DataTypeMap>(
+ make_nullable(std::make_shared<DataTypeInt32>()),
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes))));
+ TEST_REPLACE
+ }
+
+ {
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeInt32>()));
+
+ DataTypes struct_dataTypes;
+
struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeInt32>()));
+
struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeMap>(
+ make_nullable(std::make_shared<DataTypeInt32>()),
+ make_nullable(std::make_shared<DataTypeString>()))));
+ struct_dataTypes.push_back(
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes)));
+
struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeArray>(
+ make_nullable(std::make_shared<DataTypeInt32>()))));
+
+ DataTypePtr data_type_ptr =
+
make_nullable(std::make_shared<DataTypeStruct>(struct_dataTypes));
+ TEST_REPLACE
+ }
+
+ {
+ DataTypePtr data_type_ptr =
make_nullable(std::make_shared<DataTypeArray>(
+ make_nullable(std::make_shared<DataTypeArray>(
+ make_nullable(std::make_shared<DataTypeMap>(
+
make_nullable(std::make_shared<DataTypeInt32>()),
+
make_nullable(std::make_shared<DataTypeString>())))))));
+ TEST_REPLACE
+ }
+#undef TEST_REPLACE
+ }
+ }
+}
+
} // namespace doris::vectorized
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]