This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 4434ba4e7b5 [refine](predicate) reduce HybridSet/InListPredicate 
template bloat and fix create_string_value_set bug (#61999)
4434ba4e7b5 is described below

commit 4434ba4e7b51576c71b7b08ccd163188d4874b72
Author: Mryange <[email protected]>
AuthorDate: Thu Apr 2 20:01:02 2026 +0800

    [refine](predicate) reduce HybridSet/InListPredicate template bloat and fix 
create_string_value_set bug (#61999)
    
    The HybridSet / FixedContainer / InListPredicateBase template system has
    ~264 redundant template instantiations and a latent bug. This PR fixes 4
    issues:
    
    **1. Bug: `in.h` `create_string_value_set` matches wrong overload**
    
    `create_string_value_set(get_size_with_out_null(context))` silently
    matches the single-parameter template overload
    `create_string_value_set(bool)` via implicit `size_t→bool` conversion,
    instead of the intended two-parameter `create_string_value_set(size_t,
    bool)`. This causes `FixedContainer<StringRef, N>` to never activate for
    string IN queries. Fixed by calling `create_string_value_set(size,
    true)`.
    
    **2. Eliminate 24 redundant instances in `create_set` path**
    
    String types always use `StringSet<DynamicContainer>` regardless of N,
    but previous code still instantiated all N variants. Added `if constexpr
    (is_string_type(type))` short-circuit in
    `HybridSetTraits::get_function`.
    
    **3. Eliminate ~320 redundant instances in `InListPredicateBase`**
    
    - Non-string types in the constructor simply do `_values = hybrid_set`
    (pointer sharing), making N irrelevant. Moved N-dispatch behind `if
    constexpr (is_string_type(TYPE))` in both
    `predicate_creator_in_list_*.cpp`.
    - Date/DECIMALV2 types have identical `ElementType` and `ContainerType`
    between caller's HybridSet and InListPredicateBase — the data copy was
    unnecessary. Narrowed the copy branch to string-only.
    
    **4. Simplify `FixedContainer::find()` with fold expression**
    
    Replaced ~60-line manual `if constexpr` expansion (N=0..8) with a C++17
    fold expression + `std::index_sequence`. Identical assembly at `-O2`.
    
    ### Changed files
    
    | File | Change |
    |------|--------|
    | `be/src/exprs/function/in.h` | Fix `create_string_value_set` call to
    use 2-arg overload |
    | `be/src/exprs/create_predicate_function.h` | Compile-time
    short-circuit for string types |
    | `be/src/exprs/hybrid_set.h` | Fold expression rewrite of
    `FixedContainer::find()` |
    | `be/src/storage/predicate/in_list_predicate.h` | Narrow if-branch to
    string-only; remove unnecessary data copy for date/DECIMALV2 |
    | `be/src/storage/predicate/predicate_creator_in_list_in.cpp` |
    N-dispatch only for string types |
    | `be/src/storage/predicate/predicate_creator_in_list_not_in.cpp` | Same
    as above |
---
 be/src/exprs/create_predicate_function.h           | 17 +++---
 be/src/exprs/function/in.h                         |  2 +-
 be/src/exprs/hybrid_set.h                          | 65 ++++------------------
 be/src/storage/predicate/in_list_predicate.h       | 60 ++++++++++----------
 .../predicate/predicate_creator_in_list_in.cpp     | 60 +++++++++++---------
 .../predicate/predicate_creator_in_list_not_in.cpp | 60 +++++++++++---------
 6 files changed, 115 insertions(+), 149 deletions(-)

diff --git a/be/src/exprs/create_predicate_function.h 
b/be/src/exprs/create_predicate_function.h
index a9146732a1f..94f8802569c 100644
--- a/be/src/exprs/create_predicate_function.h
+++ b/be/src/exprs/create_predicate_function.h
@@ -48,17 +48,14 @@ public:
     using BasePtr = HybridSetBase*;
     template <PrimitiveType type, size_t N>
     static BasePtr get_function(bool null_aware) {
-        if constexpr (N >= 1 && N <= FIXED_CONTAINER_MAX_SIZE) {
-            using Set = std::conditional_t<
-                    is_string_type(type), StringSet<>,
-                    HybridSet<type,
-                              FixedContainer<typename 
PrimitiveTypeTraits<type>::CppType, N>>>;
-            return new Set(null_aware);
+        if constexpr (is_string_type(type)) {
+            return new StringSet<>(null_aware);
+        } else if constexpr (N >= 1 && N <= FIXED_CONTAINER_MAX_SIZE) {
+            using CppType = typename PrimitiveTypeTraits<type>::CppType;
+            return new HybridSet<type, FixedContainer<CppType, N>>(null_aware);
         } else {
-            using Set = std::conditional_t<
-                    is_string_type(type), StringSet<>,
-                    HybridSet<type, DynamicContainer<typename 
PrimitiveTypeTraits<type>::CppType>>>;
-            return new Set(null_aware);
+            using CppType = typename PrimitiveTypeTraits<type>::CppType;
+            return new HybridSet<type, DynamicContainer<CppType>>(null_aware);
         }
     }
 };
diff --git a/be/src/exprs/function/in.h b/be/src/exprs/function/in.h
index 75b1a6753f6..84058977d91 100644
--- a/be/src/exprs/function/in.h
+++ b/be/src/exprs/function/in.h
@@ -113,7 +113,7 @@ public:
                    context->get_arg_type(0)->get_primitive_type() == 
PrimitiveType::TYPE_VARCHAR ||
                    context->get_arg_type(0)->get_primitive_type() == 
PrimitiveType::TYPE_STRING) {
             // the StringValue's memory is held by FunctionContext, so we can 
use StringValueSet here directly
-            
state->hybrid_set.reset(create_string_value_set(get_size_with_out_null(context)));
+            
state->hybrid_set.reset(create_string_value_set(get_size_with_out_null(context),
 true));
         } else {
             
state->hybrid_set.reset(create_set(context->get_arg_type(0)->get_primitive_type(),
                                                
get_size_with_out_null(context), true));
diff --git a/be/src/exprs/hybrid_set.h b/be/src/exprs/hybrid_set.h
index 8b4cda61aef..6c938792123 100644
--- a/be/src/exprs/hybrid_set.h
+++ b/be/src/exprs/hybrid_set.h
@@ -72,65 +72,20 @@ public:
     // Use '|' instead of '||' has better performance by test.
     ALWAYS_INLINE bool find(const T& value) const {
         DCHECK_EQ(N, _size);
-        if constexpr (N == 0) {
+        return _find_impl(value, std::make_index_sequence<N> {});
+    }
+
+private:
+    template <size_t... I>
+    ALWAYS_INLINE bool _find_impl(const T& value, std::index_sequence<I...>) 
const {
+        if constexpr (sizeof...(I) == 0) {
             return false;
+        } else {
+            return (... | (uint8_t)(Compare::equal(value, _data[I])));
         }
-        if constexpr (N == 1) {
-            return (Compare::equal(value, _data[0]));
-        }
-        if constexpr (N == 2) {
-            return (uint8_t)(Compare::equal(value, _data[0])) |
-                   (uint8_t)(Compare::equal(value, _data[1]));
-        }
-        if constexpr (N == 3) {
-            return (uint8_t)(Compare::equal(value, _data[0])) |
-                   (uint8_t)(Compare::equal(value, _data[1])) |
-                   (uint8_t)(Compare::equal(value, _data[2]));
-        }
-        if constexpr (N == 4) {
-            return (uint8_t)(Compare::equal(value, _data[0])) |
-                   (uint8_t)(Compare::equal(value, _data[1])) |
-                   (uint8_t)(Compare::equal(value, _data[2])) |
-                   (uint8_t)(Compare::equal(value, _data[3]));
-        }
-        if constexpr (N == 5) {
-            return (uint8_t)(Compare::equal(value, _data[0])) |
-                   (uint8_t)(Compare::equal(value, _data[1])) |
-                   (uint8_t)(Compare::equal(value, _data[2])) |
-                   (uint8_t)(Compare::equal(value, _data[3])) |
-                   (uint8_t)(Compare::equal(value, _data[4]));
-        }
-        if constexpr (N == 6) {
-            return (uint8_t)(Compare::equal(value, _data[0])) |
-                   (uint8_t)(Compare::equal(value, _data[1])) |
-                   (uint8_t)(Compare::equal(value, _data[2])) |
-                   (uint8_t)(Compare::equal(value, _data[3])) |
-                   (uint8_t)(Compare::equal(value, _data[4])) |
-                   (uint8_t)(Compare::equal(value, _data[5]));
-        }
-        if constexpr (N == 7) {
-            return (uint8_t)(Compare::equal(value, _data[0])) |
-                   (uint8_t)(Compare::equal(value, _data[1])) |
-                   (uint8_t)(Compare::equal(value, _data[2])) |
-                   (uint8_t)(Compare::equal(value, _data[3])) |
-                   (uint8_t)(Compare::equal(value, _data[4])) |
-                   (uint8_t)(Compare::equal(value, _data[5])) |
-                   (uint8_t)(Compare::equal(value, _data[6]));
-        }
-        if constexpr (N == FIXED_CONTAINER_MAX_SIZE) {
-            return (uint8_t)(Compare::equal(value, _data[0])) |
-                   (uint8_t)(Compare::equal(value, _data[1])) |
-                   (uint8_t)(Compare::equal(value, _data[2])) |
-                   (uint8_t)(Compare::equal(value, _data[3])) |
-                   (uint8_t)(Compare::equal(value, _data[4])) |
-                   (uint8_t)(Compare::equal(value, _data[5])) |
-                   (uint8_t)(Compare::equal(value, _data[6])) |
-                   (uint8_t)(Compare::equal(value, _data[7]));
-        }
-        CHECK(false) << "unreachable path";
-        return false;
     }
 
+public:
     size_t size() const { return _size; }
 
     class Iterator {
diff --git a/be/src/storage/predicate/in_list_predicate.h 
b/be/src/storage/predicate/in_list_predicate.h
index df41d24d6b4..8aec120bd94 100644
--- a/be/src/storage/predicate/in_list_predicate.h
+++ b/be/src/storage/predicate/in_list_predicate.h
@@ -67,14 +67,6 @@ class InListPredicateBase final : public ColumnPredicate {
 public:
     ENABLE_FACTORY_CREATOR(InListPredicateBase);
     using T = typename PrimitiveTypeTraits<Type>::CppType;
-    using HybridSetType = std::conditional_t<
-            N >= 1 && N <= FIXED_CONTAINER_MAX_SIZE,
-            std::conditional_t<is_string_type(Type), 
StringSet<FixedContainer<std::string, N>>,
-                               HybridSet<Type, FixedContainer<T, N>,
-                                         
PredicateColumnType<PredicateEvaluateType<Type>>>>,
-            std::conditional_t<is_string_type(Type), 
StringSet<DynamicContainer<std::string>>,
-                               HybridSet<Type, DynamicContainer<T>,
-                                         
PredicateColumnType<PredicateEvaluateType<Type>>>>>;
     InListPredicateBase(uint32_t column_id, std::string col_name,
                         const std::shared_ptr<HybridSetBase>& hybrid_set, bool 
is_opposite,
                         size_t char_length = 0)
@@ -83,33 +75,39 @@ public:
               _max_value(type_limit<T>::min()) {
         CHECK(hybrid_set != nullptr);
 
-        if constexpr (is_string_type(Type) || Type == TYPE_DECIMALV2 || 
is_date_type(Type)) {
+        // String types need a copy because:
+        // 1. The caller's set is StringSet<DynamicContainer<std::string>>, 
but here we want
+        //    StringSet<FixedContainer<std::string, N>> for small-set 
optimization — different
+        //    C++ types, cannot share the pointer.
+        // 2. CHAR type additionally needs padding to char_length.
+        //
+        // Date/DECIMALV2 types do NOT need a copy: their ElementType 
(CppType) is identical
+        // between the caller's HybridSet and InListPredicateBase's, and 
InListPredicateBase
+        // only calls _values->find() / begin() / size() which are virtual and 
don't depend on
+        // the ColumnType template parameter difference.
+        if constexpr (is_string_type(Type)) {
+            using HybridSetType = std::conditional_t<N >= 1 && N <= 
FIXED_CONTAINER_MAX_SIZE,
+                                                     
StringSet<FixedContainer<std::string, N>>,
+                                                     
StringSet<DynamicContainer<std::string>>>;
             _values = std::make_shared<HybridSetType>(false);
-            if constexpr (is_string_type(Type)) {
-                HybridSetBase::IteratorBase* iter = hybrid_set->begin();
-                while (iter->has_next()) {
-                    const auto* value = (const StringRef*)(iter->get_value());
-                    if constexpr (Type == TYPE_CHAR) {
-                        _temp_datas.emplace_back("");
-                        _temp_datas.back().resize(std::max(char_length, 
value->size));
-                        memcpy(_temp_datas.back().data(), value->data, 
value->size);
-                        const std::string& str = _temp_datas.back();
-                        _values->insert((void*)str.data(), str.length());
-                    } else {
-                        _values->insert((void*)value->data, value->size);
-                    }
-                    iter->next();
-                }
-            } else {
-                HybridSetBase::IteratorBase* iter = hybrid_set->begin();
-                while (iter->has_next()) {
-                    const void* value = iter->get_value();
-                    _values->insert(value);
-                    iter->next();
+            HybridSetBase::IteratorBase* iter = hybrid_set->begin();
+            while (iter->has_next()) {
+                const auto* value = (const StringRef*)(iter->get_value());
+                if constexpr (Type == TYPE_CHAR) {
+                    _temp_datas.emplace_back("");
+                    _temp_datas.back().resize(std::max(char_length, 
value->size));
+                    memcpy(_temp_datas.back().data(), value->data, 
value->size);
+                    const std::string& str = _temp_datas.back();
+                    _values->insert((void*)str.data(), str.length());
+                } else {
+                    _values->insert((void*)value->data, value->size);
                 }
+                iter->next();
             }
         } else {
-            // shared from the caller, so it needs to be shared ptr
+            // Non-string types: directly share the caller's hybrid_set.
+            // The caller's set already contains the correct 
FixedContainer/DynamicContainer
+            // specialization; _values->find() dispatches to it via virtual 
call.
             _values = hybrid_set;
         }
         HybridSetBase::IteratorBase* iter = _values->begin();
diff --git a/be/src/storage/predicate/predicate_creator_in_list_in.cpp 
b/be/src/storage/predicate/predicate_creator_in_list_in.cpp
index 7b40a9adef6..e720da632a8 100644
--- a/be/src/storage/predicate/predicate_creator_in_list_in.cpp
+++ b/be/src/storage/predicate/predicate_creator_in_list_in.cpp
@@ -27,34 +27,42 @@ template <PrimitiveType TYPE, PredicateType PT>
 static std::shared_ptr<ColumnPredicate> create_in_list_predicate_impl(
         const uint32_t cid, const std::string col_name, const 
std::shared_ptr<HybridSetBase>& set,
         bool is_opposite, size_t char_length = 0) {
-    auto set_size = set->size();
-    if (set_size == 1) {
-        return InListPredicateBase<TYPE, PT, 1>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 2) {
-        return InListPredicateBase<TYPE, PT, 2>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 3) {
-        return InListPredicateBase<TYPE, PT, 3>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 4) {
-        return InListPredicateBase<TYPE, PT, 4>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 5) {
-        return InListPredicateBase<TYPE, PT, 5>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 6) {
-        return InListPredicateBase<TYPE, PT, 6>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 7) {
-        return InListPredicateBase<TYPE, PT, 7>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == FIXED_CONTAINER_MAX_SIZE) {
-        return InListPredicateBase<TYPE, PT, 8>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else {
+    // Only string types construct their own HybridSetType in the constructor 
(to convert
+    // from DynamicContainer to FixedContainer<std::string, N>), so N dispatch 
is only needed
+    // for them. All other types directly share the caller's hybrid_set.
+    if constexpr (!is_string_type(TYPE)) {
         return InListPredicateBase<TYPE, PT, FIXED_CONTAINER_MAX_SIZE + 
1>::create_shared(
                 cid, col_name, set, is_opposite, char_length);
+    } else {
+        auto set_size = set->size();
+        if (set_size == 1) {
+            return InListPredicateBase<TYPE, PT, 1>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 2) {
+            return InListPredicateBase<TYPE, PT, 2>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 3) {
+            return InListPredicateBase<TYPE, PT, 3>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 4) {
+            return InListPredicateBase<TYPE, PT, 4>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 5) {
+            return InListPredicateBase<TYPE, PT, 5>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 6) {
+            return InListPredicateBase<TYPE, PT, 6>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 7) {
+            return InListPredicateBase<TYPE, PT, 7>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == FIXED_CONTAINER_MAX_SIZE) {
+            return InListPredicateBase<TYPE, PT, 8>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else {
+            return InListPredicateBase<TYPE, PT, FIXED_CONTAINER_MAX_SIZE + 
1>::create_shared(
+                    cid, col_name, set, is_opposite, char_length);
+        }
     }
 }
 
diff --git a/be/src/storage/predicate/predicate_creator_in_list_not_in.cpp 
b/be/src/storage/predicate/predicate_creator_in_list_not_in.cpp
index be28f2fabc1..63e8eb37b18 100644
--- a/be/src/storage/predicate/predicate_creator_in_list_not_in.cpp
+++ b/be/src/storage/predicate/predicate_creator_in_list_not_in.cpp
@@ -27,34 +27,42 @@ template <PrimitiveType TYPE, PredicateType PT>
 static std::shared_ptr<ColumnPredicate> create_in_list_predicate_impl(
         const uint32_t cid, const std::string col_name, const 
std::shared_ptr<HybridSetBase>& set,
         bool is_opposite, size_t char_length = 0) {
-    auto set_size = set->size();
-    if (set_size == 1) {
-        return InListPredicateBase<TYPE, PT, 1>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 2) {
-        return InListPredicateBase<TYPE, PT, 2>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 3) {
-        return InListPredicateBase<TYPE, PT, 3>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 4) {
-        return InListPredicateBase<TYPE, PT, 4>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 5) {
-        return InListPredicateBase<TYPE, PT, 5>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 6) {
-        return InListPredicateBase<TYPE, PT, 6>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == 7) {
-        return InListPredicateBase<TYPE, PT, 7>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else if (set_size == FIXED_CONTAINER_MAX_SIZE) {
-        return InListPredicateBase<TYPE, PT, 8>::create_shared(cid, col_name, 
set, is_opposite,
-                                                               char_length);
-    } else {
+    // Only string types construct their own HybridSetType in the constructor 
(to convert
+    // from DynamicContainer to FixedContainer<std::string, N>), so N dispatch 
is only needed
+    // for them. All other types directly share the caller's hybrid_set.
+    if constexpr (!is_string_type(TYPE)) {
         return InListPredicateBase<TYPE, PT, FIXED_CONTAINER_MAX_SIZE + 
1>::create_shared(
                 cid, col_name, set, is_opposite, char_length);
+    } else {
+        auto set_size = set->size();
+        if (set_size == 1) {
+            return InListPredicateBase<TYPE, PT, 1>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 2) {
+            return InListPredicateBase<TYPE, PT, 2>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 3) {
+            return InListPredicateBase<TYPE, PT, 3>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 4) {
+            return InListPredicateBase<TYPE, PT, 4>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 5) {
+            return InListPredicateBase<TYPE, PT, 5>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 6) {
+            return InListPredicateBase<TYPE, PT, 6>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == 7) {
+            return InListPredicateBase<TYPE, PT, 7>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else if (set_size == FIXED_CONTAINER_MAX_SIZE) {
+            return InListPredicateBase<TYPE, PT, 8>::create_shared(cid, 
col_name, set, is_opposite,
+                                                                   
char_length);
+        } else {
+            return InListPredicateBase<TYPE, PT, FIXED_CONTAINER_MAX_SIZE + 
1>::create_shared(
+                    cid, col_name, set, is_opposite, char_length);
+        }
     }
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to