This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push:
new b2127eb305e [fix](variant) fix sparse column reader (#49211)
b2127eb305e is described below
commit b2127eb305e84d49a3667bddf160cdb2168e9da4
Author: Sun Chenyang <[email protected]>
AuthorDate: Tue Mar 18 21:15:38 2025 +0800
[fix](variant) fix sparse column reader (#49211)
---
be/src/olap/rowset/segment_v2/column_reader.cpp | 18 +++++++++---------
be/src/vec/common/schema_util.cpp | 13 +++++++++++--
be/src/vec/data_types/data_type_object.h | 3 ++-
3 files changed, 22 insertions(+), 12 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 7d9e375891c..2edbb3a1350 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -378,6 +378,14 @@ Status
VariantColumnReader::_new_iterator_with_flat_leaves(ColumnIterator** iter
const auto* node =
target_col.has_path_info() ?
_subcolumn_readers->find_leaf(relative_path) : nullptr;
if (!node) {
+ if (relative_path.get_path() == SPARSE_COLUMN_PATH) {
+ // read sparse column and filter extracted columns in
subcolumn_path_map
+ ColumnIterator* inner_iter;
+ RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&inner_iter));
+ // get subcolumns in sparse path set which will be merged into
sparse column
+ RETURN_IF_ERROR(_create_sparse_merge_reader(iterator, opts,
target_col, inner_iter));
+ return Status::OK();
+ }
if (existed_in_sparse_column || exceeded_sparse_column_limit) {
// Sparse column exists or reached sparse size limit, read sparse
column
ColumnIterator* inner_iter;
@@ -389,14 +397,6 @@ Status
VariantColumnReader::_new_iterator_with_flat_leaves(ColumnIterator** iter
const_cast<StorageReadOptions*>(opts), target_col);
return Status::OK();
}
- if (relative_path.get_path() == SPARSE_COLUMN_PATH) {
- // read sparse column and filter extracted columns in
subcolumn_path_map
- ColumnIterator* inner_iter;
- RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&inner_iter));
- // get subcolumns in sparse path set which will be merged into
sparse column
- RETURN_IF_ERROR(_create_sparse_merge_reader(iterator, opts,
target_col, inner_iter));
- return Status::OK();
- }
if (target_col.is_nested_subcolumn()) {
// using the sibling of the nested column to fill the target
nested column
RETURN_IF_ERROR(_new_default_iter_with_same_nested(iterator,
target_col));
@@ -434,7 +434,7 @@ Status VariantColumnReader::new_iterator(ColumnIterator**
iterator, const Tablet
// Otherwise the prefix is not exist and the sparse column size is reached
limit
// which means the path maybe exist in sparse_column
bool exceeded_sparse_column_limit =
!_statistics->sparse_column_non_null_size.empty() &&
-
_statistics->sparse_column_non_null_size.size() >
+
_statistics->sparse_column_non_null_size.size() ==
VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE;
// For compaction operations, read flat leaves, otherwise read
hierarchical data
diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp
index 047d488e5ad..a7d34e80961 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -820,6 +820,8 @@ Status get_compaction_schema(const
std::vector<RowsetSharedPtr>& rowsets,
void calculate_variant_stats(const IColumn& encoded_sparse_column,
segment_v2::VariantStatisticsPB* stats, size_t
row_pos,
size_t num_rows) {
+ size_t limit = VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE -
+ stats->sparse_column_non_null_size().size();
// Cast input column to ColumnMap type since sparse column is stored as a
map
const auto& map_column = assert_cast<const
ColumnMap&>(encoded_sparse_column);
@@ -844,8 +846,7 @@ void calculate_variant_stats(const IColumn&
encoded_sparse_column,
}
// If path doesn't exist and we haven't hit the max statistics
size limit,
// add it with count 1
- else if (sparse_data_paths_statistics.size() <
- VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) {
+ else if (sparse_data_paths_statistics.size() < limit) {
sparse_data_paths_statistics.emplace(path, 1);
}
}
@@ -862,6 +863,14 @@ void calculate_variant_stats(const IColumn&
encoded_sparse_column,
count_map.emplace(sparse_path, size);
}
}
+ if (stats->sparse_column_non_null_size().size() >
+ VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) {
+ throw doris::Exception(
+ ErrorCode::INTERNAL_ERROR,
+ "Sparse column non null size: {} is greater than max
statistics size: {}",
+ stats->sparse_column_non_null_size().size(),
+ VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE);
+ }
}
#include "common/compile_check_end.h"
diff --git a/be/src/vec/data_types/data_type_object.h b/be/src/vec/data_types/data_type_object.h
index ad27c57ec68..24f23fb9000 100644
--- a/be/src/vec/data_types/data_type_object.h
+++ b/be/src/vec/data_types/data_type_object.h
@@ -55,7 +55,8 @@ private:
public:
DataTypeObject() {}
DataTypeObject(int32_t max_subcolumns_count);
- const char* get_family_name() const override { return name.c_str(); }
+ String do_get_name() const override { return name; }
+ const char* get_family_name() const override { return "Variant"; }
TypeIndex get_type_id() const override { return TypeIndex::VARIANT; }
TypeDescriptor get_type_as_type_descriptor() const override {
return TypeDescriptor(TYPE_VARIANT, _max_subcolumns_count);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]