This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new a2a06c91a50 [refine](hashmap)The iterator for the agg hashmap should 
not yield NULL values. (#61113)
a2a06c91a50 is described below

commit a2a06c91a5076ea9c482ac591ece7cc6c4af7205
Author: Mryange <[email protected]>
AuthorDate: Wed Mar 11 14:04:06 2026 +0800

    [refine](hashmap)The iterator for the agg hashmap should not yield NULL 
values. (#61113)
    
    ### What problem does this PR solve?
    
    In the PR https://github.com/apache/doris/pull/51063, because the
    hashmap did not handle NULL values in one operation, it caused a
    correctness issue. To fix it, the iterator for a hashmap that contains
    NULL may return NULL values. However, this is not ideal. A hashmap can
    contain at most one NULL key, and our usual approach is to handle this
    NULL key either at the end or at the beginning. In other cases, there is
    no need to consider NULL, and this is also better for performance.
    
    This PR ensures that the iterator for the agg hashmap should not yield
    NULL values.
    
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [x] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
    - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [x] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [x] No.
    - [ ] Yes. <!-- Add document PR link here. eg:
    https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/exec/common/hash_table/hash_map_context.h | 58 +-------------
 be/src/exec/operator/set_probe_sink_operator.cpp | 39 ++++++++++
 be/src/exec/operator/set_source_operator.cpp     | 16 +++-
 be/src/exec/operator/set_source_operator.h       |  1 +
 be/test/exec/hash_map/hash_table_method_test.cpp | 96 +++++++++++++++++++++++-
 5 files changed, 150 insertions(+), 60 deletions(-)

diff --git a/be/src/exec/common/hash_table/hash_map_context.h 
b/be/src/exec/common/hash_table/hash_map_context.h
index e44c752ad0b..41e718fbd41 100644
--- a/be/src/exec/common/hash_table/hash_map_context.h
+++ b/be/src/exec/common/hash_table/hash_map_context.h
@@ -777,7 +777,7 @@ struct MethodKeysFixed : public MethodBase<TData> {
 };
 
 template <typename Base>
-struct DataWithNullKeyImpl : public Base {
+struct DataWithNullKey : public Base {
     bool& has_null_key_data() { return has_null_key; }
     bool has_null_key_data() const { return has_null_key; }
     template <typename MappedType>
@@ -802,62 +802,6 @@ protected:
     Base::Value null_key_data;
 };
 
-template <typename Base>
-struct DataWithNullKey : public DataWithNullKeyImpl<Base> {};
-
-template <IteratoredMap Base>
-struct DataWithNullKey<Base> : public DataWithNullKeyImpl<Base> {
-    using DataWithNullKeyImpl<Base>::null_key_data;
-    using DataWithNullKeyImpl<Base>::has_null_key;
-
-    struct Iterator {
-        typename Base::iterator base_iterator = {};
-        bool current_null = false;
-        Base::Value* null_key_data = nullptr;
-
-        Iterator() = default;
-        Iterator(typename Base::iterator it, bool null, Base::Value* null_key)
-                : base_iterator(it), current_null(null), 
null_key_data(null_key) {}
-        bool operator==(const Iterator& rhs) const {
-            return current_null == rhs.current_null && base_iterator == 
rhs.base_iterator;
-        }
-
-        bool operator!=(const Iterator& rhs) const { return !(*this == rhs); }
-
-        Iterator& operator++() {
-            if (current_null) {
-                current_null = false;
-            } else {
-                ++base_iterator;
-            }
-            return *this;
-        }
-
-        Base::Value& get_second() {
-            if (current_null) {
-                return *null_key_data;
-            } else {
-                return base_iterator->get_second();
-            }
-        }
-    };
-
-    Iterator begin() { return {Base::begin(), has_null_key, &null_key_data}; }
-
-    Iterator end() { return {Base::end(), false, &null_key_data}; }
-
-    void insert(const Iterator& other_iter) {
-        if (other_iter.current_null) {
-            has_null_key = true;
-            null_key_data = *other_iter.null_key_data;
-        } else {
-            Base::insert(other_iter.base_iterator);
-        }
-    }
-
-    using iterator = Iterator;
-};
-
 /// Single low cardinality column.
 template <typename SingleColumnMethod>
 struct MethodSingleNullableColumn : public SingleColumnMethod {
diff --git a/be/src/exec/operator/set_probe_sink_operator.cpp 
b/be/src/exec/operator/set_probe_sink_operator.cpp
index 26913e97641..32f61f73308 100644
--- a/be/src/exec/operator/set_probe_sink_operator.cpp
+++ b/be/src/exec/operator/set_probe_sink_operator.cpp
@@ -230,6 +230,35 @@ void 
SetProbeSinkOperatorX<is_intersect>::_refresh_hash_table(
                                 std::make_shared<typename 
HashTableCtxType::HashMapType>();
                         tmp_hash_table->reserve(
                                 
local_state._shared_state->valid_element_in_hash_tbl);
+
+                        // Handle null key separately since iterator does not 
cover it
+                        using NullMappedType =
+                                std::decay_t<decltype(arg.hash_table->template 
get_null_key_data<
+                                                      RowRefWithFlag>())>;
+                        if constexpr (std::is_same_v<NullMappedType, 
RowRefWithFlag>) {
+                            if (arg.hash_table->has_null_key_data()) {
+                                auto& null_mapped =
+                                        arg.hash_table
+                                                ->template 
get_null_key_data<RowRefWithFlag>();
+                                if constexpr (is_intersect) {
+                                    if (null_mapped.visited) {
+                                        null_mapped.visited = false;
+                                        tmp_hash_table->has_null_key_data() = 
true;
+                                        tmp_hash_table
+                                                ->template 
get_null_key_data<RowRefWithFlag>() =
+                                                null_mapped;
+                                    }
+                                } else {
+                                    if (!null_mapped.visited) {
+                                        tmp_hash_table->has_null_key_data() = 
true;
+                                        tmp_hash_table
+                                                ->template 
get_null_key_data<RowRefWithFlag>() =
+                                                null_mapped;
+                                    }
+                                }
+                            }
+                        }
+
                         while (iter != arg.end) {
                             auto& mapped = iter.get_second();
                             auto* it = &mapped;
@@ -249,6 +278,16 @@ void 
SetProbeSinkOperatorX<is_intersect>::_refresh_hash_table(
                         arg.hash_table = std::move(tmp_hash_table);
                     } else if (is_intersect) {
                         DCHECK_EQ(valid_element_in_hash_tbl, 
arg.hash_table->size());
+                        // Reset null key's visited flag separately
+                        using NullMappedType =
+                                std::decay_t<decltype(arg.hash_table->template 
get_null_key_data<
+                                                      RowRefWithFlag>())>;
+                        if constexpr (std::is_same_v<NullMappedType, 
RowRefWithFlag>) {
+                            if (arg.hash_table->has_null_key_data()) {
+                                arg.hash_table->template 
get_null_key_data<RowRefWithFlag>()
+                                        .visited = false;
+                            }
+                        }
                         while (iter != arg.end) {
                             auto& mapped = iter.get_second();
                             auto* it = &mapped;
diff --git a/be/src/exec/operator/set_source_operator.cpp 
b/be/src/exec/operator/set_source_operator.cpp
index cd1ac4bc45a..55defd9dacc 100644
--- a/be/src/exec/operator/set_source_operator.cpp
+++ b/be/src/exec/operator/set_source_operator.cpp
@@ -145,13 +145,25 @@ Status 
SetSourceOperatorX<is_intersect>::_get_data_in_hashtable(
         }
     };
 
+    // Output null key first (if present and not yet output)
+    if (!local_state._null_key_output && 
hash_table_ctx.hash_table->has_null_key_data()) {
+        auto value = hash_table_ctx.hash_table->template 
get_null_key_data<RowRefWithFlag>();
+        static_assert(std::is_same_v<RowRefWithFlag, 
std::decay_t<decltype(value)>> ||
+                      std::is_same_v<char*, std::decay_t<decltype(value)>>);
+        if constexpr (std::is_same_v<RowRefWithFlag, 
std::decay_t<decltype(value)>>) {
+            add_result(value);
+        }
+        local_state._null_key_output = true;
+    }
+
     auto& iter = hash_table_ctx.begin;
-    while (iter != hash_table_ctx.end && local_state._result_indexs.size() < 
batch_size) {
+    while (iter != hash_table_ctx.hash_table->end() &&
+           local_state._result_indexs.size() < batch_size) {
         add_result(iter.get_second());
         ++iter;
     }
 
-    *eos = iter == hash_table_ctx.end;
+    *eos = iter == hash_table_ctx.hash_table->end();
 
     COUNTER_UPDATE(local_state._get_data_from_hashtable_rows, 
local_state._result_indexs.size());
     local_state._add_result_columns();
diff --git a/be/src/exec/operator/set_source_operator.h 
b/be/src/exec/operator/set_source_operator.h
index 31e5fc77542..f2f245f1edc 100644
--- a/be/src/exec/operator/set_source_operator.h
+++ b/be/src/exec/operator/set_source_operator.h
@@ -51,6 +51,7 @@ private:
     RuntimeProfile::Counter* _filter_timer = nullptr;
     RuntimeProfile::Counter* _get_data_from_hashtable_rows = nullptr;
     IColumn::Selector _result_indexs;
+    bool _null_key_output = false;
 };
 
 template <bool is_intersect>
diff --git a/be/test/exec/hash_map/hash_table_method_test.cpp 
b/be/test/exec/hash_map/hash_table_method_test.cpp
index a7d93d7780f..60cc2284aa2 100644
--- a/be/test/exec/hash_map/hash_table_method_test.cpp
+++ b/be/test/exec/hash_map/hash_table_method_test.cpp
@@ -17,6 +17,8 @@
 
 #include <gtest/gtest.h>
 
+#include <set>
+
 #include "core/data_type/data_type_number.h"
 #include "exec/common/columns_hashing.h"
 #include "exec/common/hash_table/hash.h"
@@ -127,4 +129,96 @@ TEST(HashTableMethodTest, testMethodStringNoCache) {
               {0, 1, -1, 3, -1, 4});
 }
 
-} // namespace doris
\ No newline at end of file
+// Verify that iterating a DataWithNullKey hash map via 
init_iterator()/begin/end
+// does NOT visit the null key entry. The null key must be accessed separately
+// through has_null_key_data()/get_null_key_data().
+TEST(HashTableMethodTest, testNullableIteratorSkipsNullKey) {
+    using NullableMethod = MethodSingleNullableColumn<MethodOneNumber<
+            UInt32, DataWithNullKey<PHHashMap<UInt32, IColumn::ColumnIndex, 
HashCRC32<UInt32>>>>>;
+    NullableMethod method;
+
+    // data: {1, 0(null), 2, 0(null), 3}
+    // null_map: {0, 1, 0, 1, 0} — positions 1 and 3 are null
+    auto nullable_col =
+            ColumnHelper::create_nullable_column<DataTypeInt32>({1, 0, 2, 0, 
3}, {0, 1, 0, 1, 0});
+
+    // Insert all rows including nulls
+    {
+        using State = typename NullableMethod::State;
+        ColumnRawPtrs key_raw_columns {nullable_col.get()};
+        State state(key_raw_columns);
+        const size_t rows = nullable_col->size();
+        method.init_serialized_keys(key_raw_columns, rows);
+
+        for (size_t i = 0; i < rows; i++) {
+            IColumn::ColumnIndex mapped_value = i;
+            auto creator = [&](const auto& ctor, auto& key, auto& origin) {
+                ctor(key, mapped_value);
+            };
+            auto creator_for_null_key = [&](auto& mapped) { mapped = 
mapped_value; };
+            method.lazy_emplace(state, i, creator, creator_for_null_key);
+        }
+    }
+
+    // hash_table->size() includes null key: 3 non-null + 1 null = 4
+    EXPECT_EQ(method.hash_table->size(), 4);
+
+    // The underlying hash map (excluding null) has 3 entries
+    EXPECT_TRUE(method.hash_table->has_null_key_data());
+
+    // Iterate via init_iterator — should only visit 3 non-null entries
+    method.init_iterator();
+    size_t iter_count = 0;
+    std::set<IColumn::ColumnIndex> visited_values;
+    auto iter = method.begin;
+    while (iter != method.end) {
+        visited_values.insert(iter.get_second());
+        ++iter;
+        ++iter_count;
+    }
+
+    // Iterator must visit exactly 3 entries (the non-null keys: 1, 2, 3)
+    EXPECT_EQ(iter_count, 3);
+    // Mapped values for non-null rows are 0 (key=1), 2 (key=2), 4 (key=3)
+    EXPECT_TRUE(visited_values.count(0)); // row 0: key=1
+    EXPECT_TRUE(visited_values.count(2)); // row 2: key=2
+    EXPECT_TRUE(visited_values.count(4)); // row 4: key=3
+    // The null key's mapped value (1, from the first null row) must NOT 
appear in iteration
+    EXPECT_FALSE(visited_values.count(1));
+
+    // Null key must be accessible separately
+    auto null_mapped = method.hash_table->template 
get_null_key_data<IColumn::ColumnIndex>();
+    EXPECT_EQ(null_mapped, 1); // first null insertion at row 1
+
+    // find should locate null keys correctly
+    {
+        using State = typename NullableMethod::State;
+        // Search: {1, null, 99, 2}
+        auto search_col =
+                ColumnHelper::create_nullable_column<DataTypeInt32>({1, 0, 99, 
2}, {0, 1, 0, 0});
+        ColumnRawPtrs key_raw_columns {search_col.get()};
+        State state(key_raw_columns);
+        method.init_serialized_keys(key_raw_columns, 4);
+
+        // key=1 found
+        auto r0 = method.find(state, 0);
+        EXPECT_TRUE(r0.is_found());
+        EXPECT_EQ(r0.get_mapped(), 0);
+
+        // null found
+        auto r1 = method.find(state, 1);
+        EXPECT_TRUE(r1.is_found());
+        EXPECT_EQ(r1.get_mapped(), 1);
+
+        // key=99 not found
+        auto r2 = method.find(state, 2);
+        EXPECT_FALSE(r2.is_found());
+
+        // key=2 found
+        auto r3 = method.find(state, 3);
+        EXPECT_TRUE(r3.is_found());
+        EXPECT_EQ(r3.get_mapped(), 2);
+    }
+}
+
+} // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to