(doris) branch variant-sparse updated: [fix](variant) fix sparse column reader (#49211)

eldenmoon Tue, 18 Mar 2025 06:20:49 -0700

This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/variant-sparse by this push:
     new b2127eb305e [fix](variant) fix sparse column reader (#49211)
b2127eb305e is described below

commit b2127eb305e84d49a3667bddf160cdb2168e9da4
Author: Sun Chenyang <[email protected]>
AuthorDate: Tue Mar 18 21:15:38 2025 +0800

    [fix](variant) fix sparse column reader (#49211)
---
 be/src/olap/rowset/segment_v2/column_reader.cpp | 18 +++++++++---------
 be/src/vec/common/schema_util.cpp               | 13 +++++++++++--
 be/src/vec/data_types/data_type_object.h        |  3 ++-
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 7d9e375891c..2edbb3a1350 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -378,6 +378,14 @@ Status VariantColumnReader::_new_iterator_with_flat_leaves(ColumnIterator** iter
     const auto* node =
             target_col.has_path_info() ? _subcolumn_readers->find_leaf(relative_path) : nullptr;
     if (!node) {
+        if (relative_path.get_path() == SPARSE_COLUMN_PATH) {
+            // read sparse column and filter extracted columns in subcolumn_path_map
+            ColumnIterator* inner_iter;
+            RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&inner_iter));
+            // get subcolumns in sparse path set which will be merged into sparse column
+            RETURN_IF_ERROR(_create_sparse_merge_reader(iterator, opts, target_col, inner_iter));
+            return Status::OK();
+        }
         if (existed_in_sparse_column || exceeded_sparse_column_limit) {
             // Sparse column exists or reached sparse size limit, read sparse column
             ColumnIterator* inner_iter;
@@ -389,14 +397,6 @@ Status VariantColumnReader::_new_iterator_with_flat_leaves(ColumnIterator** iter
                     const_cast<StorageReadOptions*>(opts), target_col);
             return Status::OK();
         }
-        if (relative_path.get_path() == SPARSE_COLUMN_PATH) {
-            // read sparse column and filter extracted columns in subcolumn_path_map
-            ColumnIterator* inner_iter;
-            RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&inner_iter));
-            // get subcolumns in sparse path set which will be merged into sparse column
-            RETURN_IF_ERROR(_create_sparse_merge_reader(iterator, opts, target_col, inner_iter));
-            return Status::OK();
-        }
         if (target_col.is_nested_subcolumn()) {
             // using the sibling of the nested column to fill the target nested column
             RETURN_IF_ERROR(_new_default_iter_with_same_nested(iterator, target_col));
@@ -434,7 +434,7 @@ Status VariantColumnReader::new_iterator(ColumnIterator** iterator, const Tablet
     // Otherwise the prefix is not exist and the sparse column size is reached limit
     // which means the path maybe exist in sparse_column
     bool exceeded_sparse_column_limit = !_statistics->sparse_column_non_null_size.empty() &&
-                                        _statistics->sparse_column_non_null_size.size() >
+                                        _statistics->sparse_column_non_null_size.size() ==
                                                 VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE;
 
     // For compaction operations, read flat leaves, otherwise read hierarchical data
diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp
index 047d488e5ad..a7d34e80961 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -820,6 +820,8 @@ Status get_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets,
 void calculate_variant_stats(const IColumn& encoded_sparse_column,
                              segment_v2::VariantStatisticsPB* stats, size_t row_pos,
                              size_t num_rows) {
+    size_t limit = VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE -
+                   stats->sparse_column_non_null_size().size();
     // Cast input column to ColumnMap type since sparse column is stored as a map
     const auto& map_column = assert_cast<const ColumnMap&>(encoded_sparse_column);
 
@@ -844,8 +846,7 @@ void calculate_variant_stats(const IColumn& encoded_sparse_column,
             }
             // If path doesn't exist and we haven't hit the max statistics size limit,
             // add it with count 1
-            else if (sparse_data_paths_statistics.size() <
-                     VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) {
+            else if (sparse_data_paths_statistics.size() < limit) {
                 sparse_data_paths_statistics.emplace(path, 1);
             }
         }
@@ -862,6 +863,14 @@ void calculate_variant_stats(const IColumn& encoded_sparse_column,
             count_map.emplace(sparse_path, size);
         }
     }
+    if (stats->sparse_column_non_null_size().size() >
+        VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) {
+        throw doris::Exception(
+                ErrorCode::INTERNAL_ERROR,
+                "Sparse column non null size: {} is greater than max statistics size: {}",
+                stats->sparse_column_non_null_size().size(),
+                VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE);
+    }
 }
 
 #include "common/compile_check_end.h"
diff --git a/be/src/vec/data_types/data_type_object.h b/be/src/vec/data_types/data_type_object.h
index ad27c57ec68..24f23fb9000 100644
--- a/be/src/vec/data_types/data_type_object.h
+++ b/be/src/vec/data_types/data_type_object.h
@@ -55,7 +55,8 @@ private:
 public:
     DataTypeObject() {}
     DataTypeObject(int32_t max_subcolumns_count);
-    const char* get_family_name() const override { return name.c_str(); }
+    String do_get_name() const override { return name; }
+    const char* get_family_name() const override { return "Variant"; }
     TypeIndex get_type_id() const override { return TypeIndex::VARIANT; }
     TypeDescriptor get_type_as_type_descriptor() const override {
         return TypeDescriptor(TYPE_VARIANT, _max_subcolumns_count);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch variant-sparse updated: [fix](variant) fix sparse column reader (#49211)

Reply via email to