This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new b7ae7a07c7c [fix](join) incorrect result of left semi/anti join with
empty build side (#28898)
b7ae7a07c7c is described below
commit b7ae7a07c7caaf69ad188c1905407dba5fb34811
Author: Jerry Hu <[email protected]>
AuthorDate: Mon Dec 25 09:07:38 2023 +0800
[fix](join) incorrect result of left semi/anti join with empty build side
(#28898)
---
be/src/vec/common/hash_table/hash_map.h | 35 ++++++++++++++++++++++
.../test_null_aware_left_anti_join.out | 7 +++++
.../test_null_aware_left_anti_join.groovy | 18 ++++++++---
3 files changed, 56 insertions(+), 4 deletions(-)
diff --git a/be/src/vec/common/hash_table/hash_map.h
b/be/src/vec/common/hash_table/hash_map.h
index 6efbdbb3e94..cb2809492ae 100644
--- a/be/src/vec/common/hash_table/hash_map.h
+++ b/be/src/vec/common/hash_table/hash_map.h
@@ -226,6 +226,9 @@ public:
template <int JoinOpType>
void prepare_build(size_t num_elem, int batch_size, bool has_null_key) {
_has_null_key = has_null_key;
+
+ // The first row of the build side is not a real row from the build-side table, so num_elem <= 1 means the build side is empty.
+ _empty_build_side = num_elem <= 1;
max_batch_size = batch_size;
bucket_size = calc_bucket_size(num_elem + 1);
first.resize(bucket_size + 1);
@@ -262,6 +265,14 @@ public:
uint32_t* __restrict probe_idxs, bool& probe_visited,
uint32_t* __restrict build_idxs,
doris::vectorized::ColumnFilterHelper* mark_column) {
+ if constexpr (JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN)
{
+ if (_empty_build_side) {
+ return _process_null_aware_left_anti_join_for_empty_build_side<
+ JoinOpType, with_other_conjuncts, is_mark_join>(
+ probe_idx, probe_rows, probe_idxs, build_idxs,
mark_column);
+ }
+ }
+
if constexpr (is_mark_join) {
return _find_batch_mark<JoinOpType, with_other_conjuncts>(
keys, build_idx_map, probe_idx, probe_rows, probe_idxs,
build_idxs,
@@ -367,6 +378,29 @@ private:
return std::tuple {probe_idx, 0U, matched_cnt};
}
+ template <int JoinOpType, bool with_other_conjuncts, bool is_mark_join> // Handles NULL_AWARE_LEFT_ANTI_JOIN when _empty_build_side is set: no key lookup is done, probe rows are emitted as-is.
+ auto _process_null_aware_left_anti_join_for_empty_build_side(
+ int probe_idx, int probe_rows, uint32_t* __restrict probe_idxs, // probe_idx: first probe row to handle; probe_rows: end bound; probe_idxs: output array of emitted probe row indexes
+ uint32_t* __restrict build_idxs, // output array, written only when is_mark_join
doris::vectorized::ColumnFilterHelper* mark_column) { // used only when is_mark_join && !with_other_conjuncts
+ static_assert(JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN); // compile-time guard: this helper is valid for this join type only
+ auto matched_cnt = 0; // number of probe rows emitted by this call
+ const auto batch_size = max_batch_size; // per-call cap on emitted rows
+
+ while (probe_idx < probe_rows && matched_cnt < batch_size) { // stop when the probe rows or the batch capacity run out
+ probe_idxs[matched_cnt] = probe_idx++; // every probe row is emitted, in order
+ if constexpr (is_mark_join) {
+ build_idxs[matched_cnt] = 0; // pair with build index 0 — presumably the non-table first row noted in prepare_build; confirm
+ }
+ ++matched_cnt;
+ }
+
+ if constexpr (is_mark_join && !with_other_conjuncts) {
+ mark_column->resize_fill(matched_cnt, 1); // NOTE(review): presumably sets the mark to 1 for all emitted rows — confirm resize_fill semantics
+ }
+
+ return std::tuple {probe_idx, 0U, matched_cnt}; // (next probe_idx to process, constant 0U, rows emitted)
+ }
+
auto _find_batch_right_semi_anti(const Key* __restrict keys,
const uint32_t* __restrict build_idx_map,
int probe_idx,
int probe_rows) {
@@ -532,6 +566,7 @@ private:
Cell cell;
doris::vectorized::Arena* pool;
bool _has_null_key = false;
+ bool _empty_build_side = true;
};
template <typename Key, typename Mapped, typename Hash = DefaultHash<Key>,
diff --git
a/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out
b/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out
index d33e4e2947f..09d7d231709 100644
--- a/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out
+++ b/regression-test/data/correctness_p0/test_null_aware_left_anti_join.out
@@ -9,3 +9,10 @@
-- !select --
+-- !anti_emtpy_right --
+\N
+1
+3
+
+-- !semi_emtpy_right --
+
diff --git
a/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy
b/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy
index f732b6bda58..6083290b2e5 100644
---
a/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy
+++
b/regression-test/suites/correctness_p0/test_null_aware_left_anti_join.groovy
@@ -60,11 +60,21 @@ suite("test_null_aware_left_anti_join") {
sql """ set parallel_pipeline_task_num=2; """
qt_select """ select ${tableName2}.k1 from ${tableName2} where k1 not in
(select ${tableName1}.k1 from ${tableName1}) order by ${tableName2}.k1; """
- sql """
- drop table if exists ${tableName2};
+ // In a left anti join, if the right side is empty, all rows (nulls included) of the left side should be output.
+ qt_anti_emtpy_right """
+ select
+ *
+ from ${tableName1} t1 where k1 not in (
+ select k1 from ${tableName2} t2 where t2.k1 > 2
+ ) order by 1;
"""
- sql """
- drop table if exists ${tableName1};
+ // In a left semi join, if the right side is empty, no rows should be output.
+ qt_semi_emtpy_right """
+ select
+ *
+ from ${tableName1} t1 where k1 in (
+ select k1 from ${tableName2} t2 where t2.k1 > 2
+ ) order by 1;
"""
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]