This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 80eae07e043 branch-3.0:[fix](parquet/orc) Disable string dictionary 
filtering when predicate expression is neither a binary nor an IN predicate 
(#50749) (#51267)
80eae07e043 is described below

commit 80eae07e0439983f2ae37329f0be3b1441127bf9
Author: Socrates <suyit...@selectdb.com>
AuthorDate: Wed Jun 11 10:52:58 2025 +0800

    branch-3.0:[fix](parquet/orc) Disable string dictionary filtering when 
predicate expression is neither a binary nor an IN predicate (#50749) (#51267)
    
    cherry-pick: #50749
---
 be/src/vec/exec/format/orc/vorc_reader.cpp         |  34 ++++++------------
 .../exec/format/parquet/vparquet_group_reader.cpp  |  38 +++++++--------------
 .../hive/test_string_dict_filter.out               | Bin 11747 -> 20131 bytes
 .../hive/test_string_dict_filter.groovy            |  36 +++++++++++++++++++
 4 files changed, 60 insertions(+), 48 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 193756bc64d..891ec81e992 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -2068,29 +2068,17 @@ bool OrcReader::_can_filter_by_dict(int slot_id) {
         return false;
     }
 
-    std::function<bool(const VExpr* expr)> visit_function_call = [&](const 
VExpr* expr) {
-        // TODO: The current implementation of dictionary filtering does not 
take into account
-        //  the implementation of NULL values because the dictionary itself 
does not contain
-        //  NULL value encoding. As a result, many NULL-related functions or 
expressions
-        //  cannot work properly, such as is null, is not null, coalesce, etc.
-        //  Here we first disable dictionary filtering when predicate expr is 
not slot.
-        //  Implementation of NULL value dictionary filtering will be carried 
out later.
-        if (expr->node_type() != TExprNodeType::SLOT_REF) {
-            return false;
-        }
-        for (auto& child : expr->children()) {
-            if (!visit_function_call(child.get())) {
-                return false;
-            }
-        }
-        return true;
-    };
-    for (auto& ctx : _slot_id_to_filter_conjuncts->at(slot_id)) {
-        if (!visit_function_call(ctx->root().get())) {
-            return false;
-        }
-    }
-    return true;
+    // TODO: The current implementation of dictionary filtering does not take 
into account
+    //  the implementation of NULL values because the dictionary itself does 
not contain
+    //  NULL value encoding. As a result, many NULL-related functions or 
expressions
+    //  cannot work properly, such as is null, is not null, coalesce, etc.
+    //  Here we require every conjunct to be an IN_PRED or BINARY_PRED whose first child is a SLOT_REF.
+    //  Implementation of NULL value dictionary filtering will be carried out 
later.
+    return std::ranges::all_of(_slot_id_to_filter_conjuncts->at(slot_id), 
[&](const auto& ctx) {
+        return (ctx->root()->node_type() == TExprNodeType::IN_PRED ||
+                ctx->root()->node_type() == TExprNodeType::BINARY_PRED) &&
+               ctx->root()->children()[0]->node_type() == 
TExprNodeType::SLOT_REF;
+    });
 }
 
 Status OrcReader::on_string_dicts_loaded(
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index a18626066b1..5c5489d3f86 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -199,37 +199,25 @@ bool RowGroupReader::_can_filter_by_dict(int slot_id,
         return false;
     }
 
-    if (_slot_id_to_filter_conjuncts->find(slot_id) == 
_slot_id_to_filter_conjuncts->end()) {
+    if (!is_dictionary_encoded(column_metadata)) {
         return false;
     }
 
-    if (!is_dictionary_encoded(column_metadata)) {
+    if (_slot_id_to_filter_conjuncts->find(slot_id) == 
_slot_id_to_filter_conjuncts->end()) {
         return false;
     }
 
-    std::function<bool(const VExpr* expr)> visit_function_call = [&](const 
VExpr* expr) {
-        // TODO: The current implementation of dictionary filtering does not 
take into account
-        //  the implementation of NULL values because the dictionary itself 
does not contain
-        //  NULL value encoding. As a result, many NULL-related functions or 
expressions
-        //  cannot work properly, such as is null, is not null, coalesce, etc.
-        //  Here we first disable dictionary filtering when predicate is not 
slot.
-        //  Implementation of NULL value dictionary filtering will be carried 
out later.
-        if (expr->node_type() != TExprNodeType::SLOT_REF) {
-            return false;
-        }
-        for (auto& child : expr->children()) {
-            if (!visit_function_call(child.get())) {
-                return false;
-            }
-        }
-        return true;
-    };
-    for (auto& ctx : _slot_id_to_filter_conjuncts->at(slot_id)) {
-        if (!visit_function_call(ctx->root().get())) {
-            return false;
-        }
-    }
-    return true;
+    // TODO: The current implementation of dictionary filtering does not take 
into account
+    //  the implementation of NULL values because the dictionary itself does 
not contain
+    //  NULL value encoding. As a result, many NULL-related functions or 
expressions
+    //  cannot work properly, such as is null, is not null, coalesce, etc.
+    //  Here we require every conjunct to be an IN_PRED or BINARY_PRED whose first child is a SLOT_REF.
+    //  Implementation of NULL value dictionary filtering will be carried out 
later.
+    return std::ranges::all_of(_slot_id_to_filter_conjuncts->at(slot_id), 
[&](const auto& ctx) {
+        return (ctx->root()->node_type() == TExprNodeType::IN_PRED ||
+                ctx->root()->node_type() == TExprNodeType::BINARY_PRED) &&
+               ctx->root()->children()[0]->node_type() == 
TExprNodeType::SLOT_REF;
+    });
 }
 
 // This function is copied from
diff --git 
a/regression-test/data/external_table_p0/hive/test_string_dict_filter.out 
b/regression-test/data/external_table_p0/hive/test_string_dict_filter.out
index 2a8cebd8723..97b1fb2ff4c 100644
Binary files 
a/regression-test/data/external_table_p0/hive/test_string_dict_filter.out and 
b/regression-test/data/external_table_p0/hive/test_string_dict_filter.out differ
diff --git 
a/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy 
b/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy
index 1929c813c55..18e62570ad7 100644
--- 
a/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy
+++ 
b/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy
@@ -62,6 +62,24 @@ suite("test_string_dict_filter", 
"p0,external,hive,external_docker,external_dock
         qt_q15 """
         select count(o_orderpriority) from ( select (case when o_orderpriority 
= 'x' then '1' when o_orderpriority = 'y' then '2' else '0' end) as 
o_orderpriority from test_string_dict_filter_parquet ) as A where 
o_orderpriority = '0';
         """
+        qt_q16 """
+        select * from test_string_dict_filter_parquet where cast(o_orderstatus 
as string) = 'F';
+        """
+        qt_q17 """
+        select * from test_string_dict_filter_parquet where cast(o_orderstatus 
as string) = 'O';
+        """
+        qt_q18 """
+        select * from test_string_dict_filter_parquet where cast(o_orderstatus 
as string) in ('O', 'F');
+        """
+        qt_q19 """
+        select * from test_string_dict_filter_parquet where 
cast(o_orderpriority as string) is null;
+        """
+        qt_q20 """
+        select * from test_string_dict_filter_parquet where 
cast(o_orderpriority as string) is not null;
+        """
+        qt_q21 """
+        select * from test_string_dict_filter_parquet where 
cast(o_orderpriority as string) in ('5-LOW', NULL);
+        """
     }
     def q_orc = {
         qt_q01 """
@@ -109,6 +127,24 @@ suite("test_string_dict_filter", 
"p0,external,hive,external_docker,external_dock
         qt_q15 """
         select count(o_orderpriority) from ( select (case when o_orderpriority 
= 'x' then '1' when o_orderpriority = 'y' then '2' else '0' end) as 
o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority = 
'0';
         """
+        qt_q16 """
+        select * from test_string_dict_filter_orc where cast(o_orderstatus as 
string) = 'F';
+        """
+        qt_q17 """
+        select * from test_string_dict_filter_orc where cast(o_orderstatus as 
string) = 'O';
+        """
+        qt_q18 """
+        select * from test_string_dict_filter_orc where cast(o_orderstatus as 
string) in ('O', 'F');
+        """
+        qt_q19 """
+        select * from test_string_dict_filter_orc where cast(o_orderpriority 
as string) is null;
+        """
+        qt_q20 """
+        select * from test_string_dict_filter_orc where cast(o_orderpriority 
as string) is not null;
+        """
+        qt_q21 """
+        select * from test_string_dict_filter_orc where cast(o_orderpriority 
as string) in ('5-LOW', NULL);
+        """
     }
     String enabled = context.config.otherConfigs.get("enableHiveTest")
     if (enabled == null || !enabled.equalsIgnoreCase("true")) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to