github-actions[bot] commented on code in PR #25493:
URL: https://github.com/apache/doris/pull/25493#discussion_r1366384190
##########
be/src/vec/columns/column_struct.cpp:
##########
@@ -224,8 +224,8 @@ void
ColumnStruct::update_crcs_with_value(std::vector<uint64_t>& hash, Primitive
}
}
-void ColumnStruct::insert_indices_from(const IColumn& src, const int*
indices_begin,
- const int* indices_end) {
+void ColumnStruct::insert_indices_from(const IColumn& src, const uint32_t*
__restrict indices_begin,
Review Comment:
warning: method 'insert_indices_from' can be made static [readability-convert-member-functions-to-static]
be/src/vec/columns/column_struct.h:122:
```diff
- void insert_indices_from(const IColumn& src, const uint32* __restrict indices_begin,
+ static void insert_indices_from(const IColumn& src, const uint32* __restrict indices_begin,
```
##########
be/src/vec/common/hash_table/hash_map.h:
##########
@@ -193,10 +196,155 @@ class HashMapTable : public HashTable<Key, Cell, Hash,
Grower, Allocator> {
bool has_null_key_data() const { return false; }
};
+template <typename Key, typename Cell, typename Hash = DefaultHash<Key>,
+ typename Grower = HashTableGrower<>, typename Allocator =
HashTableAllocator>
+class JoinHashMapTable : public HashMapTable<Key, Cell, Hash, Grower,
Allocator> {
+public:
+ using Self = JoinHashMapTable;
+ using Base = HashMapTable<Key, Cell, Hash, Grower, Allocator>;
+
+ using key_type = Key;
+ using value_type = typename Cell::value_type;
+ using mapped_type = typename Cell::Mapped;
+
+ using LookupResult = typename Base::LookupResult;
+
+ using HashMapTable<Key, Cell, Hash, Grower, Allocator>::HashMapTable;
+
+ static uint32_t calc_bucket_size(size_t num_elem) {
+ size_t expect_bucket_size = static_cast<size_t>(num_elem) + (num_elem
- 1) / 7;
+ return phmap::priv::NormalizeCapacity(expect_bucket_size) + 1;
+ }
+
+ void build(const Key* __restrict keys, const size_t* __restrict
hash_values, size_t num_elem,
+ int batch_size) {
+ max_batch_size = batch_size;
+ bucket_size = calc_bucket_size(num_elem + 1);
+ first.resize(bucket_size, 0);
+ next.resize(num_elem);
+
+ build_keys = keys;
+ for (size_t i = 1; i < num_elem; i++) {
+ uint32_t bucket_num = hash_values[i] & (bucket_size - 1);
+ next[i] = first[bucket_num];
+ first[bucket_num] = i;
+ }
+ }
+
+ template <int JoinOpType>
+ auto find_batch(const Key* __restrict keys, const size_t* __restrict
hash_values, int probe_idx,
+ int probe_rows, std::vector<uint32_t>& probe_idxs,
+ std::vector<uint32_t>& build_idxs) {
+ if constexpr (JoinOpType == doris::TJoinOp::INNER_JOIN ||
+ JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN) {
+ return _find_batch_inner_outer_join<JoinOpType>(keys, hash_values,
probe_idx,
+ probe_rows,
probe_idxs, build_idxs);
+ }
+ if constexpr (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN ||
+ JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN) {
+ return _find_batch_left_semi_anti<JoinOpType>(keys, hash_values,
probe_idx, probe_rows,
+ probe_idxs);
+ }
+ return std::pair {0, 0};
+ }
+
+private:
+ template <int JoinOpType>
+ auto _find_batch_left_semi_anti(const Key* __restrict keys,
+ const size_t* __restrict hash_values, int
probe_idx,
+ int probe_rows, std::vector<uint32_t>&
probe_idxs) {
+ auto matched_cnt = 0;
+ const auto batch_size = max_batch_size;
+
+ while (LIKELY(probe_idx < probe_rows && matched_cnt < batch_size)) {
Review Comment:
warning: boolean expression can be simplified by DeMorgan's theorem [readability-simplify-boolean-expr]
```cpp
while (LIKELY(probe_idx < probe_rows && matched_cnt < batch_size)) {
^
```
<details>
<summary>Additional context</summary>
**be/src/common/compiler_util.h:34:** expanded from macro 'LIKELY'
```cpp
#define LIKELY(expr) __builtin_expect(!!(expr), 1)
^
```
</details>
##########
be/src/vec/common/hash_table/hash_map.h:
##########
@@ -20,6 +20,9 @@
#pragma once
+#include <span>
+
+#include "gen_cpp/PlanNodes_types.h"
Review Comment:
warning: 'gen_cpp/PlanNodes_types.h' file not found [clang-diagnostic-error]
```cpp
#include "gen_cpp/PlanNodes_types.h"
^
```
##########
be/src/vec/core/block.cpp:
##########
@@ -944,7 +944,8 @@ void MutableBlock::add_row(const Block* block, int row) {
}
}
-void MutableBlock::add_rows(const Block* block, const int* row_begin, const
int* row_end) {
+void MutableBlock::add_rows(const Block* block, const uint32_t* row_begin,
Review Comment:
warning: method 'add_rows' can be made const [readability-make-member-function-const]
be/src/vec/core/block.h:594:
```diff
- void add_rows(const Block* block, const uint32_t* row_begin, const uint32_t* row_end);
+ void add_rows(const Block* block, const uint32_t* row_begin, const uint32_t* row_end) const;
```
be/src/vec/core/block.cpp:947:
```diff
- const uint32_t* row_end) {
+ const uint32_t* row_end) const {
```
##########
be/src/vec/common/hash_table/hash_map.h:
##########
@@ -193,10 +196,155 @@ class HashMapTable : public HashTable<Key, Cell, Hash,
Grower, Allocator> {
bool has_null_key_data() const { return false; }
};
+template <typename Key, typename Cell, typename Hash = DefaultHash<Key>,
+ typename Grower = HashTableGrower<>, typename Allocator =
HashTableAllocator>
+class JoinHashMapTable : public HashMapTable<Key, Cell, Hash, Grower,
Allocator> {
+public:
+ using Self = JoinHashMapTable;
+ using Base = HashMapTable<Key, Cell, Hash, Grower, Allocator>;
+
+ using key_type = Key;
+ using value_type = typename Cell::value_type;
+ using mapped_type = typename Cell::Mapped;
+
+ using LookupResult = typename Base::LookupResult;
+
+ using HashMapTable<Key, Cell, Hash, Grower, Allocator>::HashMapTable;
+
+ static uint32_t calc_bucket_size(size_t num_elem) {
+ size_t expect_bucket_size = static_cast<size_t>(num_elem) + (num_elem
- 1) / 7;
+ return phmap::priv::NormalizeCapacity(expect_bucket_size) + 1;
+ }
+
+ void build(const Key* __restrict keys, const size_t* __restrict
hash_values, size_t num_elem,
+ int batch_size) {
+ max_batch_size = batch_size;
+ bucket_size = calc_bucket_size(num_elem + 1);
+ first.resize(bucket_size, 0);
+ next.resize(num_elem);
+
+ build_keys = keys;
+ for (size_t i = 1; i < num_elem; i++) {
+ uint32_t bucket_num = hash_values[i] & (bucket_size - 1);
+ next[i] = first[bucket_num];
+ first[bucket_num] = i;
+ }
+ }
+
+ template <int JoinOpType>
+ auto find_batch(const Key* __restrict keys, const size_t* __restrict
hash_values, int probe_idx,
+ int probe_rows, std::vector<uint32_t>& probe_idxs,
+ std::vector<uint32_t>& build_idxs) {
+ if constexpr (JoinOpType == doris::TJoinOp::INNER_JOIN ||
+ JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN) {
+ return _find_batch_inner_outer_join<JoinOpType>(keys, hash_values,
probe_idx,
+ probe_rows,
probe_idxs, build_idxs);
+ }
+ if constexpr (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN ||
+ JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN) {
+ return _find_batch_left_semi_anti<JoinOpType>(keys, hash_values,
probe_idx, probe_rows,
+ probe_idxs);
+ }
+ return std::pair {0, 0};
+ }
+
+private:
+ template <int JoinOpType>
+ auto _find_batch_left_semi_anti(const Key* __restrict keys,
+ const size_t* __restrict hash_values, int
probe_idx,
+ int probe_rows, std::vector<uint32_t>&
probe_idxs) {
+ auto matched_cnt = 0;
+ const auto batch_size = max_batch_size;
+
+ while (LIKELY(probe_idx < probe_rows && matched_cnt < batch_size)) {
+ uint32_t bucket_num = hash_values[probe_idx] & (bucket_size - 1);
+ auto build_idx = first[bucket_num];
+
+ while (build_idx) {
+ if (keys[probe_idx] == build_keys[build_idx]) {
+ break;
+ }
+ build_idx = next[build_idx];
+ }
+ const bool matched =
+ JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN ? build_idx
!= 0 : build_idx == 0;
+ matched_cnt += matched;
+ probe_idxs[matched_cnt - matched] = probe_idx++;
+ }
+ return std::pair {probe_idx, matched_cnt};
+ }
+
+ template <int JoinOpType>
+ auto _find_batch_inner_outer_join(const Key* __restrict keys,
+ const size_t* __restrict hash_values,
int probe_idx,
+ int probe_rows, std::vector<uint32_t>&
probe_idxs,
+ std::vector<uint32_t>& build_idxs) {
+ auto matched_cnt = 0;
+ const auto batch_size = max_batch_size;
+ uint32_t build_idx = 0;
+
+ auto do_the_probe = [&]() {
+ while (build_idx && LIKELY(matched_cnt < batch_size)) {
+ if (keys[probe_idx] == build_keys[build_idx]) {
+ probe_idxs[matched_cnt] = probe_idx;
+ build_idxs[matched_cnt] = build_idx;
+ matched_cnt++;
+ }
+ build_idx = next[build_idx];
+ }
+
+ if constexpr (JoinOpType != doris::TJoinOp::INNER_JOIN) {
+ // `(!matched_cnt || probe_idxs[matched_cnt - 1] !=
probe_idx)` means not match one build side
+ if (!build_idx && (!matched_cnt || probe_idxs[matched_cnt - 1]
!= probe_idx)) {
+ probe_idxs[matched_cnt] = probe_idx;
+ build_idxs[matched_cnt] = build_idx;
+ matched_cnt++;
+ }
+ }
+
+ if (matched_cnt == max_batch_size && build_idx) {
+ current_probe_idx = probe_idx;
+ current_build_idx = build_idx;
+ } else {
+ probe_idx++;
+ }
+ };
+
+ // some row over the batch_size, need dispose first
+ if (probe_idx == current_probe_idx) {
+ current_probe_idx = -1;
+ build_idx = current_build_idx;
+ current_build_idx = 0;
+ do_the_probe();
+ }
+ while (LIKELY(probe_idx < probe_rows && matched_cnt < batch_size)) {
Review Comment:
warning: boolean expression can be simplified by DeMorgan's theorem [readability-simplify-boolean-expr]
```cpp
while (LIKELY(probe_idx < probe_rows && matched_cnt < batch_size)) {
^
```
<details>
<summary>Additional context</summary>
**be/src/common/compiler_util.h:34:** expanded from macro 'LIKELY'
```cpp
#define LIKELY(expr) __builtin_expect(!!(expr), 1)
^
```
</details>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]