This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 80eae07e043 branch-3.0:[fix](parquet/orc) Disable string dictionary filtering when predicate express is not binary pred and in pred (#50749) (#51267) 80eae07e043 is described below commit 80eae07e0439983f2ae37329f0be3b1441127bf9 Author: Socrates <suyit...@selectdb.com> AuthorDate: Wed Jun 11 10:52:58 2025 +0800 branch-3.0:[fix](parquet/orc) Disable string dictionary filtering when predicate express is not binary pred and in pred (#50749) (#51267) cherry-pick: #50749 --- be/src/vec/exec/format/orc/vorc_reader.cpp | 34 ++++++------------ .../exec/format/parquet/vparquet_group_reader.cpp | 38 +++++++-------------- .../hive/test_string_dict_filter.out | Bin 11747 -> 20131 bytes .../hive/test_string_dict_filter.groovy | 36 +++++++++++++++++++ 4 files changed, 60 insertions(+), 48 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 193756bc64d..891ec81e992 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -2068,29 +2068,17 @@ bool OrcReader::_can_filter_by_dict(int slot_id) { return false; } - std::function<bool(const VExpr* expr)> visit_function_call = [&](const VExpr* expr) { - // TODO: The current implementation of dictionary filtering does not take into account - // the implementation of NULL values because the dictionary itself does not contain - // NULL value encoding. As a result, many NULL-related functions or expressions - // cannot work properly, such as is null, is not null, coalesce, etc. - // Here we first disable dictionary filtering when predicate expr is not slot. - // Implementation of NULL value dictionary filtering will be carried out later. - if (expr->node_type() != TExprNodeType::SLOT_REF) { - return false; - } - for (auto& child : expr->children()) { - if (!visit_function_call(child.get())) { - return false; - } - } - return true; - }; - for (auto& ctx : _slot_id_to_filter_conjuncts->at(slot_id)) { - if (!visit_function_call(ctx->root().get())) { - return false; - } - } - return true; + // TODO: The current implementation of dictionary filtering does not take into account + // the implementation of NULL values because the dictionary itself does not contain + // NULL value encoding. As a result, many NULL-related functions or expressions + // cannot work properly, such as is null, is not null, coalesce, etc. + // Here we check if the predicate expr is IN or BINARY_PRED. + // Implementation of NULL value dictionary filtering will be carried out later. + return std::ranges::all_of(_slot_id_to_filter_conjuncts->at(slot_id), [&](const auto& ctx) { + return (ctx->root()->node_type() == TExprNodeType::IN_PRED || + ctx->root()->node_type() == TExprNodeType::BINARY_PRED) && + ctx->root()->children()[0]->node_type() == TExprNodeType::SLOT_REF; + }); } Status OrcReader::on_string_dicts_loaded( diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index a18626066b1..5c5489d3f86 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -199,37 +199,25 @@ bool RowGroupReader::_can_filter_by_dict(int slot_id, return false; } - if (_slot_id_to_filter_conjuncts->find(slot_id) == _slot_id_to_filter_conjuncts->end()) { + if (!is_dictionary_encoded(column_metadata)) { return false; } - if (!is_dictionary_encoded(column_metadata)) { + if (_slot_id_to_filter_conjuncts->find(slot_id) == _slot_id_to_filter_conjuncts->end()) { return false; } - std::function<bool(const VExpr* expr)> visit_function_call = [&](const VExpr* expr) { - // TODO: The current implementation of dictionary filtering does not take into account - // the implementation of NULL values because the dictionary itself does not contain - // NULL value encoding. As a result, many NULL-related functions or expressions - // cannot work properly, such as is null, is not null, coalesce, etc. - // Here we first disable dictionary filtering when predicate is not slot. - // Implementation of NULL value dictionary filtering will be carried out later. - if (expr->node_type() != TExprNodeType::SLOT_REF) { - return false; - } - for (auto& child : expr->children()) { - if (!visit_function_call(child.get())) { - return false; - } - } - return true; - }; - for (auto& ctx : _slot_id_to_filter_conjuncts->at(slot_id)) { - if (!visit_function_call(ctx->root().get())) { - return false; - } - } - return true; + // TODO: The current implementation of dictionary filtering does not take into account + // the implementation of NULL values because the dictionary itself does not contain + // NULL value encoding. As a result, many NULL-related functions or expressions + // cannot work properly, such as is null, is not null, coalesce, etc. + // Here we check if the predicate expr is IN or BINARY_PRED. + // Implementation of NULL value dictionary filtering will be carried out later. + return std::ranges::all_of(_slot_id_to_filter_conjuncts->at(slot_id), [&](const auto& ctx) { + return (ctx->root()->node_type() == TExprNodeType::IN_PRED || + ctx->root()->node_type() == TExprNodeType::BINARY_PRED) && + ctx->root()->children()[0]->node_type() == TExprNodeType::SLOT_REF; + }); } // This function is copied from diff --git a/regression-test/data/external_table_p0/hive/test_string_dict_filter.out b/regression-test/data/external_table_p0/hive/test_string_dict_filter.out index 2a8cebd8723..97b1fb2ff4c 100644 Binary files a/regression-test/data/external_table_p0/hive/test_string_dict_filter.out and b/regression-test/data/external_table_p0/hive/test_string_dict_filter.out differ diff --git a/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy b/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy index 1929c813c55..18e62570ad7 100644 --- a/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy +++ b/regression-test/suites/external_table_p0/hive/test_string_dict_filter.groovy @@ -62,6 +62,24 @@ suite("test_string_dict_filter", "p0,external,hive,external_docker,external_dock qt_q15 """ select count(o_orderpriority) from ( select (case when o_orderpriority = 'x' then '1' when o_orderpriority = 'y' then '2' else '0' end) as o_orderpriority from test_string_dict_filter_parquet ) as A where o_orderpriority = '0'; """ + qt_q16 """ + select * from test_string_dict_filter_parquet where cast(o_orderstatus as string) = 'F'; + """ + qt_q17 """ + select * from test_string_dict_filter_parquet where cast(o_orderstatus as string) = 'O'; + """ + qt_q18 """ + select * from test_string_dict_filter_parquet where cast(o_orderstatus as string) in ('O', 'F'); + """ + qt_q19 """ + select * from test_string_dict_filter_parquet where cast(o_orderpriority as string) is null; + """ + qt_q20 """ + select * from test_string_dict_filter_parquet where cast(o_orderpriority as string) is not null; + """ + qt_q21 """ + select * from test_string_dict_filter_parquet where cast(o_orderpriority as string) in ('5-LOW', NULL); + """ } def q_orc = { qt_q01 """ @@ -109,6 +127,24 @@ suite("test_string_dict_filter", "p0,external,hive,external_docker,external_dock qt_q15 """ select count(o_orderpriority) from ( select (case when o_orderpriority = 'x' then '1' when o_orderpriority = 'y' then '2' else '0' end) as o_orderpriority from test_string_dict_filter_orc ) as A where o_orderpriority = '0'; """ + qt_q16 """ + select * from test_string_dict_filter_orc where cast(o_orderstatus as string) = 'F'; + """ + qt_q17 """ + select * from test_string_dict_filter_orc where cast(o_orderstatus as string) = 'O'; + """ + qt_q18 """ + select * from test_string_dict_filter_orc where cast(o_orderstatus as string) in ('O', 'F'); + """ + qt_q19 """ + select * from test_string_dict_filter_orc where cast(o_orderpriority as string) is null; + """ + qt_q20 """ + select * from test_string_dict_filter_orc where cast(o_orderpriority as string) is not null; + """ + qt_q21 """ + select * from test_string_dict_filter_orc where cast(o_orderpriority as string) in ('5-LOW', NULL); + """ } String enabled = context.config.otherConfigs.get("enableHiveTest") if (enabled == null || !enabled.equalsIgnoreCase("true")) { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org