This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch segiter-map
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 436553b52e915dc1b15ea92fc242087654bf0fee
Author: eldenmoon <[email protected]>
AuthorDate: Fri Aug 25 18:16:09 2023 +0800

    [refactor](segment iterator) remove std::map in iterator, use std::vector 
instead, and do not rely on unique id to identify position
    
    1. This could improve segment init performance
    2. variant will use path column_id to identify a column instead of unique id, 
since its subcolumn does not contain a unique id
---
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 107 +++++++++------------
 be/src/olap/rowset/segment_v2/segment_iterator.h   |  12 +--
 2 files changed, 51 insertions(+), 68 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 87b8014117..090c55439e 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -23,10 +23,12 @@
 
 #include <algorithm>
 #include <boost/iterator/iterator_facade.hpp>
+#include <iterator>
 #include <memory>
 #include <numeric>
 #include <set>
 #include <utility>
+#include <vector>
 
 // IWYU pragma: no_include <opentelemetry/common/threadlocal.h>
 #include "common/compiler_util.h" // IWYU pragma: keep
@@ -198,6 +200,9 @@ private:
 SegmentIterator::SegmentIterator(std::shared_ptr<Segment> segment, SchemaSPtr 
schema)
         : _segment(std::move(segment)),
           _schema(schema),
+          _column_iterators(_schema->num_columns()),
+          _bitmap_index_iterators(_schema->num_columns()),
+          _inverted_index_iterators(_schema->num_columns()),
           _cur_rowid(0),
           _lazy_materialization_read(false),
           _lazy_inited(false),
@@ -368,16 +373,15 @@ Status SegmentIterator::_prepare_seek(const 
StorageReadOptions::KeyRange& key_ra
 
     // create used column iterator
     for (auto cid : _seek_schema->column_ids()) {
-        int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
-        if (_column_iterators.count(unique_id) < 1) {
+        if (_column_iterators[cid] == nullptr) {
             
RETURN_IF_ERROR(_segment->new_column_iterator(_opts.tablet_schema->column(cid),
-                                                          
&_column_iterators[unique_id]));
+                                                          
&_column_iterators[cid]));
             ColumnIteratorOptions iter_opts;
             iter_opts.stats = _opts.stats;
             iter_opts.use_page_cache = _opts.use_page_cache;
             iter_opts.file_reader = _file_reader.get();
             iter_opts.io_ctx = _opts.io_ctx;
-            RETURN_IF_ERROR(_column_iterators[unique_id]->init(iter_opts));
+            RETURN_IF_ERROR(_column_iterators[cid]->init(iter_opts));
         }
     }
 
@@ -440,8 +444,7 @@ Status 
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
         // get row ranges by bf index of this column,
         RowRanges column_bf_row_ranges = RowRanges::create_single(num_rows());
         DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
-        uint32_t unique_cid = _schema->unique_id(cid);
-        
RETURN_IF_ERROR(_column_iterators[unique_cid]->get_row_ranges_by_bloom_filter(
+        RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_bloom_filter(
                 _opts.col_id_to_predicates.at(cid).get(), 
&column_bf_row_ranges));
         RowRanges::ranges_intersection(bf_row_ranges, column_bf_row_ranges, 
&bf_row_ranges);
     }
@@ -456,7 +459,7 @@ Status 
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
         // get row ranges by zone map of this column,
         RowRanges column_row_ranges = RowRanges::create_single(num_rows());
         DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
-        
RETURN_IF_ERROR(_column_iterators[_schema->unique_id(cid)]->get_row_ranges_by_zone_map(
+        RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_zone_map(
                 _opts.col_id_to_predicates.at(cid).get(),
                 _opts.del_predicates_for_zone_map.count(cid) > 0
                         ? &(_opts.del_predicates_for_zone_map.at(cid))
@@ -472,13 +475,12 @@ Status 
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
         auto query_ctx = _opts.runtime_state->get_query_ctx();
         runtime_predicate = 
query_ctx->get_runtime_predicate().get_predictate();
         if (runtime_predicate) {
-            int32_t cid = 
_opts.tablet_schema->column(runtime_predicate->column_id()).unique_id();
             AndBlockColumnPredicate and_predicate;
             auto single_predicate = new 
SingleColumnBlockPredicate(runtime_predicate.get());
             and_predicate.add_column_predicate(single_predicate);
 
             RowRanges column_rp_row_ranges = 
RowRanges::create_single(num_rows());
-            
RETURN_IF_ERROR(_column_iterators[_schema->unique_id(cid)]->get_row_ranges_by_zone_map(
+            
RETURN_IF_ERROR(_column_iterators[runtime_predicate->column_id()]->get_row_ranges_by_zone_map(
                     &and_predicate, nullptr, &column_rp_row_ranges));
 
             // intersect different columns's row ranges to get final row 
ranges by zone map
@@ -499,8 +501,7 @@ Status 
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
         for (auto cid : cids) {
             RowRanges tmp_row_ranges = RowRanges::create_single(num_rows());
             DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
-            uint32_t unique_cid = _schema->unique_id(cid);
-            
RETURN_IF_ERROR(_column_iterators[unique_cid]->get_row_ranges_by_dict(
+            RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict(
                     _opts.col_id_to_predicates.at(cid).get(), 
&tmp_row_ranges));
             RowRanges::ranges_intersection(dict_row_ranges, tmp_row_ranges, 
&dict_row_ranges);
         }
@@ -523,19 +524,18 @@ Status SegmentIterator::_apply_bitmap_index() {
     std::vector<ColumnPredicate*> remaining_predicates;
 
     for (auto pred : _col_predicates) {
-        int32_t unique_id = _schema->unique_id(pred->column_id());
-        if (_bitmap_index_iterators.count(unique_id) < 1 ||
-            _bitmap_index_iterators[unique_id] == nullptr || pred->type() == 
PredicateType::BF) {
+        auto cid = pred->column_id();
+        if (_bitmap_index_iterators[cid] == nullptr || pred->type() == 
PredicateType::BF) {
             // no bitmap index for this column
             remaining_predicates.push_back(pred);
         } else {
-            
RETURN_IF_ERROR(pred->evaluate(_bitmap_index_iterators[unique_id].get(),
+            RETURN_IF_ERROR(pred->evaluate(_bitmap_index_iterators[cid].get(),
                                            _segment->num_rows(), 
&_row_bitmap));
 
             auto column_name = _schema->column(pred->column_id())->name();
             if (_check_column_pred_all_push_down(column_name) &&
                 !pred->predicate_params()->marked_by_runtime_filter) {
-                _need_read_data_indices[unique_id] = false;
+                _need_read_data_indices[cid] = false;
             }
 
             if (_row_bitmap.isEmpty()) {
@@ -657,9 +657,7 @@ bool 
SegmentIterator::_can_filter_by_preds_except_leafnode_of_andnode() {
 }
 
 bool SegmentIterator::_check_apply_by_bitmap_index(ColumnPredicate* pred) {
-    int32_t unique_id = _schema->unique_id(pred->column_id());
-    if (_bitmap_index_iterators.count(unique_id) < 1 ||
-        _bitmap_index_iterators[unique_id] == nullptr) {
+    if (_bitmap_index_iterators[pred->column_id()] == nullptr) {
         // no bitmap index for this column
         return false;
     }
@@ -667,9 +665,7 @@ bool 
SegmentIterator::_check_apply_by_bitmap_index(ColumnPredicate* pred) {
 }
 
 bool SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred, 
bool pred_in_compound) {
-    int32_t unique_id = _schema->unique_id(pred->column_id());
-    if (_inverted_index_iterators.count(unique_id) < 1 ||
-        _inverted_index_iterators[unique_id] == nullptr) {
+    if (_inverted_index_iterators[pred->column_id()] == nullptr) {
         //this column without inverted index
         return false;
     }
@@ -689,7 +685,7 @@ bool 
SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred, bool
         return false;
     }
 
-    bool handle_by_fulltext = _column_has_fulltext_index(unique_id);
+    bool handle_by_fulltext = _column_has_fulltext_index(pred->column_id());
     if (handle_by_fulltext) {
         // when predicate in compound condition which except leafNode of 
andNode,
         // only can apply match query for fulltext index,
@@ -705,8 +701,7 @@ bool 
SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred, bool
 
 Status SegmentIterator::_apply_bitmap_index_except_leafnode_of_andnode(
         ColumnPredicate* pred, roaring::Roaring* output_result) {
-    int32_t unique_id = _schema->unique_id(pred->column_id());
-    RETURN_IF_ERROR(pred->evaluate(_bitmap_index_iterators[unique_id].get(), 
_segment->num_rows(),
+    
RETURN_IF_ERROR(pred->evaluate(_bitmap_index_iterators[pred->column_id()].get(),
 _segment->num_rows(),
                                    output_result));
     return Status::OK();
 }
@@ -716,8 +711,7 @@ Status 
SegmentIterator::_apply_inverted_index_except_leafnode_of_andnode(
     if (_opts.runtime_state && 
!_opts.runtime_state->query_options().enable_inverted_index_query) {
         return Status::OK();
     }
-    int32_t unique_id = _schema->unique_id(pred->column_id());
-    RETURN_IF_ERROR(pred->evaluate(*_schema, 
_inverted_index_iterators[unique_id].get(), num_rows(),
+    RETURN_IF_ERROR(pred->evaluate(*_schema, 
_inverted_index_iterators[pred->column_id()].get(), num_rows(),
                                    output_result));
     return Status::OK();
 }
@@ -745,8 +739,7 @@ Status 
SegmentIterator::_apply_index_except_leafnode_of_andnode() {
             continue;
         }
 
-        int32_t unique_id = _schema->unique_id(pred->column_id());
-        bool need_remaining_after_evaluate = 
_column_has_fulltext_index(unique_id) &&
+        bool need_remaining_after_evaluate = 
_column_has_fulltext_index(pred->column_id()) &&
                                              
PredicateTypeTraits::is_equal_or_list(pred_type);
         if (!res.ok()) {
             if (_downgrade_without_index(res, need_remaining_after_evaluate)) {
@@ -771,8 +764,7 @@ Status 
SegmentIterator::_apply_index_except_leafnode_of_andnode() {
             _check_column_pred_all_push_down(column_name, true,
                                              pred->type() == 
PredicateType::MATCH) &&
             !pred->predicate_params()->marked_by_runtime_filter) {
-            int32_t unique_id = _schema->unique_id(pred->column_id());
-            _need_read_data_indices[unique_id] = false;
+            _need_read_data_indices[pred->column_id()] = false;
         }
     }
 
@@ -823,11 +815,10 @@ std::string 
SegmentIterator::_gen_predicate_result_sign(ColumnPredicateInfo* pre
     return pred_result_sign;
 }
 
-bool SegmentIterator::_column_has_fulltext_index(int32_t unique_id) {
+bool SegmentIterator::_column_has_fulltext_index(int32_t cid) {
     bool has_fulltext_index =
-            _inverted_index_iterators.count(unique_id) > 0 &&
-            _inverted_index_iterators[unique_id] != nullptr &&
-            
_inverted_index_iterators[unique_id]->get_inverted_index_reader_type() ==
+            _inverted_index_iterators[cid] != nullptr &&
+            _inverted_index_iterators[cid]->get_inverted_index_reader_type() ==
                     InvertedIndexReaderType::FULLTEXT;
 
     return has_fulltext_index;
@@ -852,11 +843,10 @@ Status 
SegmentIterator::_apply_inverted_index_on_column_predicate(
     if (!_check_apply_by_inverted_index(pred)) {
         remaining_predicates.emplace_back(pred);
     } else {
-        int32_t unique_id = _schema->unique_id(pred->column_id());
-        bool need_remaining_after_evaluate = 
_column_has_fulltext_index(unique_id) &&
+        bool need_remaining_after_evaluate = 
_column_has_fulltext_index(pred->column_id()) &&
                                              
PredicateTypeTraits::is_equal_or_list(pred->type());
         roaring::Roaring bitmap = _row_bitmap;
-        Status res = pred->evaluate(*_schema, 
_inverted_index_iterators[unique_id].get(),
+        Status res = pred->evaluate(*_schema, 
_inverted_index_iterators[pred->column_id()].get(),
                                     num_rows(), &bitmap);
         if (!res.ok()) {
             if (_downgrade_without_index(res, need_remaining_after_evaluate)) {
@@ -891,7 +881,7 @@ Status 
SegmentIterator::_apply_inverted_index_on_column_predicate(
         if (_check_column_pred_all_push_down(column_name, false,
                                              pred->type() == 
PredicateType::MATCH) &&
             !pred->predicate_params()->marked_by_runtime_filter) {
-            _need_read_data_indices[unique_id] = false;
+            _need_read_data_indices[pred->column_id()] = false;
         }
     }
     return Status::OK();
@@ -901,8 +891,7 @@ Status 
SegmentIterator::_apply_inverted_index_on_block_column_predicate(
         ColumnId column_id, MutilColumnBlockPredicate* pred,
         std::set<const ColumnPredicate*>& no_need_to_pass_column_predicate_set,
         bool* continue_apply) {
-    auto unique_id = _schema->unique_id(column_id);
-    bool handle_by_fulltext = _column_has_fulltext_index(unique_id);
+    bool handle_by_fulltext = _column_has_fulltext_index(column_id);
     std::set<const ColumnPredicate*> predicate_set {};
 
     pred->get_all_column_predicate(predicate_set);
@@ -912,20 +901,19 @@ Status 
SegmentIterator::_apply_inverted_index_on_block_column_predicate(
     //2. There are multiple predicates for this column.
     //3. All the predicates are range predicate.
     //4. if it's under fulltext parser type, we need to skip inverted index 
evaluate.
-    if (_inverted_index_iterators.count(unique_id) > 0 &&
-        _inverted_index_iterators[unique_id] != nullptr && 
predicate_set.size() > 1 &&
+    if (_inverted_index_iterators[column_id] != nullptr && 
predicate_set.size() > 1 &&
         all_predicates_are_range_predicate(predicate_set) && 
!handle_by_fulltext) {
         roaring::Roaring output_result = _row_bitmap;
 
         std::string column_name = _schema->column(column_id)->name();
 
-        auto res = pred->evaluate(column_name, 
_inverted_index_iterators[unique_id].get(),
+        auto res = pred->evaluate(column_name, 
_inverted_index_iterators[column_id].get(),
                                   num_rows(), &output_result);
 
         if (res.ok()) {
             if (_check_column_pred_all_push_down(column_name) &&
                 !all_predicates_are_marked_by_runtime_filter(predicate_set)) {
-                _need_read_data_indices[unique_id] = false;
+                _need_read_data_indices[column_id] = false;
             }
             no_need_to_pass_column_predicate_set.insert(predicate_set.begin(), 
predicate_set.end());
             _row_bitmap &= output_result;
@@ -957,7 +945,7 @@ bool SegmentIterator::_need_read_data(ColumnId cid) {
         return true;
     }
     int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
-    if (_need_read_data_indices.count(unique_id) > 0 && 
!_need_read_data_indices[unique_id] &&
+    if (_need_read_data_indices.count(cid) > 0 && 
!_need_read_data_indices[cid] &&
         _output_columns.count(unique_id) < 1) {
         VLOG_DEBUG << "SegmentIterator no need read data for column: "
                    << _opts.tablet_schema->column_by_uid(unique_id).name();
@@ -1010,7 +998,7 @@ Status SegmentIterator::_init_return_column_iterators() {
 
     for (auto cid : _schema->column_ids()) {
         if (_schema->column(cid)->name() == BeConsts::ROWID_COL) {
-            _column_iterators[_schema->column(cid)->unique_id()].reset(
+            _column_iterators[cid].reset(
                     new RowIdColumnIterator(_opts.tablet_id, _opts.rowset_id, 
_segment->id()));
             continue;
         }
@@ -1027,10 +1015,9 @@ Status SegmentIterator::_init_return_column_iterators() {
             tmp_is_pred_column[cid] = true;
         }
 
-        int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
-        if (_column_iterators.count(unique_id) < 1) {
+        if (_column_iterators[cid] == nullptr) {
             
RETURN_IF_ERROR(_segment->new_column_iterator(_opts.tablet_schema->column(cid),
-                                                          
&_column_iterators[unique_id]));
+                                                          
&_column_iterators[cid]));
             ColumnIteratorOptions iter_opts;
             iter_opts.stats = _opts.stats;
             iter_opts.use_page_cache = _opts.use_page_cache;
@@ -1039,7 +1026,7 @@ Status SegmentIterator::_init_return_column_iterators() {
             // If the col is predicate column, then should read the last page 
to check
             // if the column is full dict encoding
             iter_opts.is_predicate_column = tmp_is_pred_column[cid];
-            RETURN_IF_ERROR(_column_iterators[unique_id]->init(iter_opts));
+            RETURN_IF_ERROR(_column_iterators[cid]->init(iter_opts));
         }
     }
     return Status::OK();
@@ -1050,10 +1037,9 @@ Status SegmentIterator::_init_bitmap_index_iterators() {
         return Status::OK();
     }
     for (auto cid : _schema->column_ids()) {
-        int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
-        if (_bitmap_index_iterators.count(unique_id) < 1) {
+        if (_bitmap_index_iterators[cid] == nullptr) {
             RETURN_IF_ERROR(_segment->new_bitmap_index_iterator(
-                    _opts.tablet_schema->column(cid), 
&_bitmap_index_iterators[unique_id]));
+                    _opts.tablet_schema->column(cid), 
&_bitmap_index_iterators[cid]));
         }
     }
     return Status::OK();
@@ -1064,11 +1050,10 @@ Status 
SegmentIterator::_init_inverted_index_iterators() {
         return Status::OK();
     }
     for (auto cid : _schema->column_ids()) {
-        int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
-        if (_inverted_index_iterators.count(unique_id) < 1) {
+        if (_inverted_index_iterators[cid] == nullptr) {
             RETURN_IF_ERROR(_segment->new_inverted_index_iterator(
                     _opts.tablet_schema->column(cid), 
_opts.tablet_schema->get_inverted_index(cid),
-                    _opts.stats, &_inverted_index_iterators[unique_id]));
+                    _opts.stats, &_inverted_index_iterators[cid]));
         }
     }
     return Status::OK();
@@ -1241,7 +1226,7 @@ Status SegmentIterator::_seek_columns(const 
std::vector<ColumnId>& column_ids, r
         if (!_need_read_data(cid)) {
             continue;
         }
-        
RETURN_IF_ERROR(_column_iterators[_schema->unique_id(cid)]->seek_to_ordinal(pos));
+        RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(pos));
     }
     return Status::OK();
 }
@@ -1460,7 +1445,7 @@ bool 
SegmentIterator::_can_evaluated_by_vectorized(ColumnPredicate* predicate) {
             field_type == FieldType::OLAP_FIELD_TYPE_STRING) {
             return config::enable_low_cardinality_optimize &&
                    _opts.io_ctx.reader_type == ReaderType::READER_QUERY &&
-                   
_column_iterators[_schema->unique_id(cid)]->is_all_dict_encoding();
+                   _column_iterators[cid]->is_all_dict_encoding();
         } else if (field_type == FieldType::OLAP_FIELD_TYPE_DECIMAL) {
             return false;
         }
@@ -1519,7 +1504,7 @@ Status SegmentIterator::_read_columns(const 
std::vector<ColumnId>& column_ids,
         if (_prune_column(cid, column, true, rows_read)) {
             continue;
         }
-        
RETURN_IF_ERROR(_column_iterators[_schema->unique_id(cid)]->next_batch(&rows_read,
 column));
+        RETURN_IF_ERROR(_column_iterators[cid]->next_batch(&rows_read, 
column));
         if (nrows != rows_read) {
             return Status::Error<ErrorCode::INTERNAL_ERROR>("nrows({}) != 
rows_read({})", nrows,
                                                             rows_read);
@@ -1733,7 +1718,7 @@ Status 
SegmentIterator::_read_columns_by_rowids(std::vector<ColumnId>& read_colu
         if (_prune_column(cid, (*mutable_columns)[cid], true, select_size)) {
             continue;
         }
-        
RETURN_IF_ERROR(_column_iterators[_schema->unique_id(cid)]->read_by_rowids(
+        RETURN_IF_ERROR(_column_iterators[cid]->read_by_rowids(
                 rowids.data(), select_size, _current_return_columns[cid]));
     }
 
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h 
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index a25cf1b5bf..a44d0b6eba 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -187,7 +187,7 @@ private:
             ColumnPredicate* pred, roaring::Roaring* output_result);
     [[nodiscard]] Status _apply_inverted_index_except_leafnode_of_andnode(
             ColumnPredicate* pred, roaring::Roaring* output_result);
-    bool _column_has_fulltext_index(int32_t unique_id);
+    bool _column_has_fulltext_index(int32_t cid);
     bool _downgrade_without_index(Status res, bool need_remaining = false);
     inline bool _inverted_index_not_support_pred_type(const PredicateType& 
type);
     bool _can_filter_by_preds_except_leafnode_of_andnode();
@@ -331,12 +331,10 @@ private:
 
     std::shared_ptr<Segment> _segment;
     SchemaSPtr _schema;
-    // _column_iterators_map.size() == _schema.num_columns()
-    // map<unique_id, ColumnIterator*> 
_column_iterators_map/_bitmap_index_iterators;
-    // can use _schema get unique_id by cid
-    std::map<int32_t, std::unique_ptr<ColumnIterator>> _column_iterators;
-    std::map<int32_t, std::unique_ptr<BitmapIndexIterator>> 
_bitmap_index_iterators;
-    std::map<int32_t, std::unique_ptr<InvertedIndexIterator>> 
_inverted_index_iterators;
+    // vector idx -> column iterator
+    std::vector<std::unique_ptr<ColumnIterator>> _column_iterators;
+    std::vector<std::unique_ptr<BitmapIndexIterator>> _bitmap_index_iterators;
+    std::vector<std::unique_ptr<InvertedIndexIterator>> 
_inverted_index_iterators;
     // after init(), `_row_bitmap` contains all rowid to scan
     roaring::Roaring _row_bitmap;
     // "column_name+operator+value-> <in_compound_query, rowid_result>


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to