This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 1006a78487e5cbe84f202d99f210860e78d1996f
Author: Qi Chen <[email protected]>
AuthorDate: Fri Aug 25 21:16:43 2023 +0800

    [Fix](multi-catalog) Fix incorrect Hive results by disabling the string dict 
filter if exprs contain a null expr. (#23361)
    
    Issue Number: close #21960
    
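    Fix incorrect Hive query results by disabling the string dict filter if the 
exprs contain a null expr. Previously only the root expr of each conjunct was 
checked, so a null predicate nested inside another expr (e.g. 
`a <> 'y' or a is null`) was missed.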
---
 be/src/vec/exec/format/orc/vorc_reader.cpp           | 19 +++++++++++++++----
 .../exec/format/parquet/vparquet_group_reader.cpp    | 20 ++++++++++++++++----
 .../hive/test_external_catalog_hive.out              |  6 ++++++
 .../hive/test_external_catalog_hive.groovy           |  5 +++++
 4 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index a1b1aa34b5..99aa9cf5a3 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -1754,15 +1754,26 @@ bool OrcReader::_can_filter_by_dict(int slot_id) {
     }
 
     // TODO:check expr like 'a > 10 is null', 'a > 10' should can be filter by 
dict.
-    for (auto& ctx : _slot_id_to_filter_conjuncts->at(slot_id)) {
-        const auto& root_expr = ctx->root();
-        if (root_expr->node_type() == TExprNodeType::FUNCTION_CALL) {
+    std::function<bool(const VExpr* expr)> visit_function_call = [&](const 
VExpr* expr) {
+        if (expr->node_type() == TExprNodeType::FUNCTION_CALL) {
             std::string is_null_str;
-            std::string function_name = root_expr->fn().name.function_name;
+            std::string function_name = expr->fn().name.function_name;
             if (function_name.compare("is_null_pred") == 0 ||
                 function_name.compare("is_not_null_pred") == 0) {
                 return false;
             }
+        } else {
+            for (auto& child : expr->children()) {
+                if (!visit_function_call(child.get())) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    };
+    for (auto& ctx : _slot_id_to_filter_conjuncts->at(slot_id)) {
+        if (!visit_function_call(ctx->root().get())) {
+            return false;
         }
     }
     return true;
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index d3aa4c3cad..661fcd24f6 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -197,19 +197,31 @@ bool RowGroupReader::_can_filter_by_dict(int slot_id,
     }
 
     // TODO:check expr like 'a > 10 is null', 'a > 10' should can be filter by 
dict.
-    for (auto& ctx : _slot_id_to_filter_conjuncts->at(slot_id)) {
-        const auto& root_expr = ctx->root();
-        if (root_expr->node_type() == TExprNodeType::FUNCTION_CALL) {
+    std::function<bool(const VExpr* expr)> visit_function_call = [&](const 
VExpr* expr) {
+        if (expr->node_type() == TExprNodeType::FUNCTION_CALL) {
             std::string is_null_str;
-            std::string function_name = root_expr->fn().name.function_name;
+            std::string function_name = expr->fn().name.function_name;
             if (function_name.compare("is_null_pred") == 0 ||
                 function_name.compare("is_not_null_pred") == 0) {
                 return false;
             }
+        } else {
+            for (auto& child : expr->children()) {
+                if (!visit_function_call(child.get())) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    };
+    for (auto& ctx : _slot_id_to_filter_conjuncts->at(slot_id)) {
+        if (!visit_function_call(ctx->root().get())) {
+            return false;
         }
     }
     return true;
 }
+
 // This function is copied from
 // 
https://github.com/apache/impala/blob/master/be/src/exec/parquet/hdfs-parquet-scanner.cc#L1717
 bool RowGroupReader::is_dictionary_encoded(const tparquet::ColumnMetaData& 
column_metadata) {
diff --git 
a/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out 
b/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out
index ae29339cec..0cb7db4732 100644
--- a/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out
+++ b/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out
@@ -120,3 +120,9 @@ Z6n2t4XA2n7CXTECJ,PE,iBbsCh0RE1Dd2A,z48
 \N     2073732 2       13846443        596483.00       21.00   29163.75        
0.10    0.08    R       F       1994-12-06      1995-01-01      DELIVER IN 
PERSON       FOB     dolphins nag furiously q
 \N     2479044 4       9763795 13805.00        40.00   74332.40        0.05    
0.05    R       F       1994-11-16      1995-01-01      COLLECT COD     RAIL    
equests hinder qu
 
+-- !null_expr_dict_filter_orc --
+4844   4363
+
+-- !null_expr_dict_filter_parquet --
+4844   4363
+
diff --git 
a/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy
 
b/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy
index e0a56e89c6..03d5cda5f1 100644
--- 
a/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy
+++ 
b/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy
@@ -93,6 +93,11 @@ suite("test_external_catalog_hive", "p2") {
         qt_not_single_slot_filter_conjuncts_orc """ select * from 
multi_catalog.lineitem_string_date_orc where l_commitdate < l_receiptdate and 
l_receiptdate = '1995-01-01'  order by l_orderkey, l_partkey, l_suppkey, 
l_linenumber limit 10; """
         qt_not_single_slot_filter_conjuncts_parquet """ select * from 
multi_catalog.lineitem_string_date_orc where l_commitdate < l_receiptdate and 
l_receiptdate = '1995-01-01'  order by l_orderkey, l_partkey, l_suppkey, 
l_linenumber limit 10; """
 
+        // test null expr with dict filter issue
+        qt_null_expr_dict_filter_orc """ select count(*), count(distinct 
user_no) from multi_catalog.dict_fitler_test_orc WHERE partitions in 
('2023-08-21') and actual_intf_type  =  'type1' and (REUSE_FLAG<> 'y' or 
REUSE_FLAG is null); """
+        qt_null_expr_dict_filter_parquet """ select count(*), count(distinct 
user_no) from multi_catalog.dict_fitler_test_parquet WHERE partitions in 
('2023-08-21') and actual_intf_type  =  'type1' and (REUSE_FLAG<> 'y' or 
REUSE_FLAG is null); """
+
+
         // test remember last used database after switch / rename catalog
         sql """switch ${catalog_name};"""
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to