This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 7ed947eccfd [revert](orc) revert orc predicate pushdown related
commits (#47662)
7ed947eccfd is described below
commit 7ed947eccfdd4223be340d915247c629ef37ff75
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Sun Feb 9 18:44:45 2025 +0800
[revert](orc) revert orc predicate pushdown related commits (#47662)
revert:
branch-3.0: [fix](orc) ignore null values when the literals of
in_predicate contains #45104 (#45586)
[fix](orc) check all the cases before build_search_argument (#44615)
(#44802)
branch-3.0: [enhance](orc) Optimize ORC Predicate Pushdown for
OR-connected Predicate #43255 (#44436)
re-pick:
branch-3.0: [Fix](ORC) Not push down fixed char type in orc reader
#45484 (#45525)
---------
Co-authored-by: Socrates <[email protected]>
---
be/src/apache-orc | 2 +-
be/src/runtime/exec_env.h | 1 -
be/src/vec/exec/format/orc/vorc_reader.cpp | 496 ++++++---------------
be/src/vec/exec/format/orc/vorc_reader.h | 45 +-
be/test/exec/test_data/orc_scanner/orders.orc | Bin 1293 -> 0 bytes
be/test/testutil/desc_tbl_builder.cpp | 29 +-
be/test/testutil/desc_tbl_builder.h | 17 +-
be/test/vec/exec/orc_reader_test.cpp | 161 -------
.../data/external_table_p0/hive/test_hive_orc.out | Bin 92747 -> 90753 bytes
.../external_table_p0/hive/test_hive_orc.groovy | 34 --
.../hive/test_hive_orc_predicate.groovy | 2 +-
11 files changed, 179 insertions(+), 608 deletions(-)
diff --git a/be/src/apache-orc b/be/src/apache-orc
index 2f937bdc764..db01184f765 160000
--- a/be/src/apache-orc
+++ b/be/src/apache-orc
@@ -1 +1 @@
-Subproject commit 2f937bdc76406f150b484b6e57629aa8a03d48b6
+Subproject commit db01184f765c03496e4107bd3ac37c077ac4bc5f
diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h
index 40e7cc42922..886927fa68b 100644
--- a/be/src/runtime/exec_env.h
+++ b/be/src/runtime/exec_env.h
@@ -302,7 +302,6 @@ public:
static void set_tracking_memory(bool tracking_memory) {
_s_tracking_memory.store(tracking_memory, std::memory_order_release);
}
- void set_orc_memory_pool(orc::MemoryPool* pool) { _orc_memory_pool = pool;
}
void set_non_block_close_thread_pool(std::unique_ptr<ThreadPool>&& pool) {
_non_block_close_thread_pool = std::move(pool);
}
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 26e41afe3c9..151b5820515 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -18,14 +18,13 @@
#include "vorc_reader.h"
#include <cctz/civil_time_detail.h>
+#include <ctype.h>
#include <gen_cpp/Metrics_types.h>
-#include <gen_cpp/Opcodes_types.h>
#include <gen_cpp/PlanNodes_types.h>
#include <gen_cpp/Types_types.h>
#include <glog/logging.h>
#include <algorithm>
-#include <cctype>
// IWYU pragma: no_include <bits/chrono.h>
#include <chrono> // IWYU pragma: keep
#include <exception>
@@ -34,11 +33,12 @@
#include <memory>
#include <ostream>
#include <tuple>
-#include <utility>
+#include <variant>
#include "cctz/civil_time.h"
#include "cctz/time_zone.h"
#include "common/exception.h"
+#include "exec/olap_utils.h"
#include "exprs/create_predicate_function.h"
#include "exprs/hybrid_set.h"
#include "gutil/strings/substitute.h"
@@ -55,7 +55,6 @@
#include "runtime/descriptors.h"
#include "runtime/primitive_type.h"
#include "runtime/thread_context.h"
-#include "util/runtime_profile.h"
#include "util/slice.h"
#include "util/timezone_utils.h"
#include "vec/columns/column.h"
@@ -72,13 +71,15 @@
#include "vec/data_types/data_type_map.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_struct.h"
+#include "vec/exec/format/orc/orc_memory_pool.h"
#include "vec/exec/format/table/transactional_hive_common.h"
#include "vec/exprs/vbloom_predicate.h"
#include "vec/exprs/vdirect_in_predicate.h"
#include "vec/exprs/vectorized_fn_call.h"
-#include "vec/exprs/vexpr_fwd.h"
#include "vec/exprs/vin_predicate.h"
+#include "vec/exprs/vliteral.h"
#include "vec/exprs/vruntimefilter_wrapper.h"
+#include "vec/exprs/vslot_ref.h"
#include "vec/runtime/vdatetime_value.h"
namespace doris {
@@ -235,10 +236,6 @@ void OrcReader::_init_profile() {
ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecodeNullMapTime",
orc_profile, 1);
_orc_profile.filter_block_time =
ADD_CHILD_TIMER_WITH_LEVEL(_profile, "FilterBlockTime",
orc_profile, 1);
- _orc_profile.selected_row_group_count =
- ADD_COUNTER_WITH_LEVEL(_profile, "SelectedRowGroupCount",
TUnit::UNIT, 1);
- _orc_profile.evaluated_row_group_count =
- ADD_COUNTER_WITH_LEVEL(_profile, "EvaluatedRowGroupCount",
TUnit::UNIT, 1);
}
}
@@ -261,7 +258,6 @@ Status OrcReader::_create_file_reader() {
try {
orc::ReaderOptions options;
options.setMemoryPool(*ExecEnv::GetInstance()->orc_memory_pool());
- options.setReaderMetrics(&_reader_metrics);
_reader = orc::createReader(
std::unique_ptr<ORCFileInputStream>(_file_input_stream.release()), options);
} catch (std::exception& e) {
@@ -391,9 +387,6 @@ Status OrcReader::_init_read_columns() {
}
_col_name_to_file_col_name[col_name] = read_col;
- // TODO: refactor this
- std::transform(read_col.begin(), read_col.end(), read_col.begin(),
::tolower);
- _col_name_to_file_col_name_low_case[col_name] = read_col;
}
}
return Status::OK();
@@ -468,10 +461,8 @@ static std::unordered_map<orc::TypeKind,
orc::PredicateDataType> TYPEKIND_TO_PRE
{orc::TypeKind::BOOLEAN, orc::PredicateDataType::BOOLEAN}};
template <PrimitiveType primitive_type>
-std::tuple<bool, orc::Literal> convert_to_orc_literal(const orc::Type* type,
- StringRef& literal_data,
int precision,
- int scale) {
- const auto* value = literal_data.data;
+std::tuple<bool, orc::Literal> convert_to_orc_literal(const orc::Type* type,
const void* value,
+ int precision, int
scale) {
try {
switch (type->getKind()) {
case orc::TypeKind::BOOLEAN:
@@ -496,7 +487,8 @@ std::tuple<bool, orc::Literal> convert_to_orc_literal(const
orc::Type* type,
// case orc::TypeKind::CHAR:
// [[fallthrough]];
case orc::TypeKind::VARCHAR: {
- return std::make_tuple(true, orc::Literal(literal_data.data,
literal_data.size));
+ StringRef* string_value = (StringRef*)value;
+ return std::make_tuple(true, orc::Literal(string_value->data,
string_value->size));
}
case orc::TypeKind::DECIMAL: {
int128_t decimal_value;
@@ -567,368 +559,179 @@ std::tuple<bool, orc::Literal>
convert_to_orc_literal(const orc::Type* type,
}
}
-std::tuple<bool, orc::Literal, orc::PredicateDataType>
OrcReader::_make_orc_literal(
- const VSlotRef* slot_ref, const VLiteral* literal) {
-
DCHECK(_col_name_to_file_col_name_low_case.contains(slot_ref->expr_name()));
- auto file_col_name_low_case =
_col_name_to_file_col_name_low_case[slot_ref->expr_name()];
- if (!_type_map.contains(file_col_name_low_case)) {
- // TODO: this is for acid table
- LOG(WARNING) << "Column " << slot_ref->expr_name() << " not found in
_type_map";
- return std::make_tuple(false, orc::Literal(false),
orc::PredicateDataType::LONG);
- }
- DCHECK(_type_map.contains(file_col_name_low_case));
- const auto* orc_type = _type_map[file_col_name_low_case];
- if (!TYPEKIND_TO_PREDICATE_TYPE.contains(orc_type->getKind())) {
- LOG(WARNING) << "Unsupported Push Down Orc Type [TypeKind=" <<
orc_type->getKind() << "]";
- return std::make_tuple(false, orc::Literal(false),
orc::PredicateDataType::LONG);
- }
- const auto predicate_type =
TYPEKIND_TO_PREDICATE_TYPE[orc_type->getKind()];
- if (literal == nullptr) {
- // only get the predicate_type
- return std::make_tuple(true, orc::Literal(true), predicate_type);
- }
- // this only happens when the literals of in_predicate contains null
value, like in (1, null)
- if (literal->get_column_ptr()->is_null_at(0)) {
- return std::make_tuple(false, orc::Literal(false), predicate_type);
- }
- auto literal_data = literal->get_column_ptr()->get_data_at(0);
- auto* slot = _tuple_descriptor->slots()[slot_ref->column_id()];
- auto slot_type = slot->type();
- auto primitive_type = slot_type.type;
- auto src_type = OrcReader::convert_to_doris_type(orc_type).type;
- // should not down predicate for string type change from other type
- if (src_type != primitive_type && !is_string_type(src_type) &&
is_string_type(primitive_type)) {
- LOG(WARNING) << "Unsupported Push Down Schema Changed Column " <<
primitive_type << " to "
- << src_type;
- return std::make_tuple(false, orc::Literal(false),
orc::PredicateDataType::LONG);
- }
- switch (primitive_type) {
-#define M(NAME)
\
- case TYPE_##NAME: {
\
- auto [valid, orc_literal] = convert_to_orc_literal<TYPE_##NAME>(
\
- orc_type, literal_data, slot_type.precision, slot_type.scale);
\
- return std::make_tuple(valid, orc_literal, predicate_type);
\
- }
-#define APPLY_FOR_PRIMITIVE_TYPE(M) \
- M(TINYINT) \
- M(SMALLINT) \
- M(INT) \
- M(BIGINT) \
- M(LARGEINT) \
- M(DATE) \
- M(DATETIME) \
- M(DATEV2) \
- M(DATETIMEV2) \
- M(VARCHAR) \
- M(STRING) \
- M(HLL) \
- M(DECIMAL32) \
- M(DECIMAL64) \
- M(DECIMAL128I) \
- M(DECIMAL256) \
- M(DECIMALV2) \
- M(BOOLEAN) \
- M(IPV4) \
- M(IPV6)
- APPLY_FOR_PRIMITIVE_TYPE(M)
-#undef M
- default: {
- VLOG_CRITICAL << "Unsupported Convert Orc Literal [ColName=" <<
slot->col_name() << "]";
- return std::make_tuple(false, orc::Literal(false), predicate_type);
- }
- }
-}
-
-// check if the slot of expr can be pushed down to orc reader and make orc
predicate type
-bool OrcReader::_check_slot_can_push_down(const VExprSPtr& expr) {
- if (!expr->children()[0]->is_slot_ref()) {
- return false;
- }
- const auto* slot_ref = static_cast<const
VSlotRef*>(expr->children()[0].get());
- // check if the slot exists in orc file and not partition column
- if (!_col_name_to_file_col_name.contains(slot_ref->expr_name()) ||
-
_lazy_read_ctx.predicate_partition_columns.contains(slot_ref->expr_name())) {
- return false;
- }
- auto [valid, _, predicate_type] = _make_orc_literal(slot_ref, nullptr);
- if (valid) {
- _vslot_ref_to_orc_predicate_data_type[slot_ref] = predicate_type;
- }
- return valid;
-}
-
-// check if the literal of expr can be pushed down to orc reader and make orc
literal
-bool OrcReader::_check_literal_can_push_down(const VExprSPtr& expr, uint16_t
child_id) {
- if (!expr->children()[child_id]->is_literal()) {
- return false;
- }
- // the slot has been checked in _check_slot_can_push_down before calling
this function
- const auto* slot_ref = static_cast<const
VSlotRef*>(expr->children()[0].get());
- const auto* literal = static_cast<const
VLiteral*>(expr->children()[child_id].get());
- auto [valid, orc_literal, _] = _make_orc_literal(slot_ref, literal);
- if (valid) {
- _vliteral_to_orc_literal.insert(std::make_pair(literal, orc_literal));
- }
- return valid;
-}
-
-// check if there are rest children of expr can be pushed down to orc reader
-bool OrcReader::_check_rest_children_can_push_down(const VExprSPtr& expr) {
- if (expr->children().size() < 2) {
- return false;
- }
+template <PrimitiveType primitive_type>
+std::vector<OrcPredicate> value_range_to_predicate(
+ const ColumnValueRange<primitive_type>& col_val_range, const
orc::Type* type) {
+ std::vector<OrcPredicate> predicates;
- bool at_least_one_child_can_push_down = false;
- for (size_t i = 1; i < expr->children().size(); ++i) {
- if (_check_literal_can_push_down(expr, i)) {
- at_least_one_child_can_push_down = true;
+ PrimitiveType src_type = OrcReader::convert_to_doris_type(type).type;
+ if (src_type != primitive_type) {
+ if (!(is_string_type(src_type) && is_string_type(primitive_type))) {
+ // not support schema change
+ return predicates;
}
}
- return at_least_one_child_can_push_down;
-}
-// check if the expr can be pushed down to orc reader
-bool OrcReader::_check_expr_can_push_down(const VExprSPtr& expr) {
- if (expr == nullptr) {
- return false;
+ orc::PredicateDataType predicate_data_type;
+ auto type_it = TYPEKIND_TO_PREDICATE_TYPE.find(type->getKind());
+ if (type_it == TYPEKIND_TO_PREDICATE_TYPE.end()) {
+ // Unsupported type
+ return predicates;
+ } else {
+ predicate_data_type = type_it->second;
}
- switch (expr->op()) {
- case TExprOpcode::COMPOUND_AND:
- // at least one child can be pushed down
- return std::ranges::any_of(expr->children(), [this](const auto& child)
{
- return _check_expr_can_push_down(child);
- });
- case TExprOpcode::COMPOUND_OR:
- // all children must be pushed down
- return std::ranges::all_of(expr->children(), [this](const auto& child)
{
- return _check_expr_can_push_down(child);
- });
- case TExprOpcode::COMPOUND_NOT:
- DCHECK_EQ(expr->children().size(), 1);
- return _check_expr_can_push_down(expr->children()[0]);
-
- case TExprOpcode::GE:
- case TExprOpcode::GT:
- case TExprOpcode::LE:
- case TExprOpcode::LT:
- case TExprOpcode::EQ:
- case TExprOpcode::NE:
- case TExprOpcode::FILTER_IN:
- case TExprOpcode::FILTER_NOT_IN:
- return _check_slot_can_push_down(expr) &&
_check_rest_children_can_push_down(expr);
-
- case TExprOpcode::INVALID_OPCODE:
- if (expr->node_type() == TExprNodeType::FUNCTION_CALL) {
- auto fn_name = expr->fn().name.function_name;
- // only support is_null_pred and is_not_null_pred
- if (fn_name == "is_null_pred" || fn_name == "is_not_null_pred") {
- return _check_slot_can_push_down(expr);
- }
- VLOG_CRITICAL << "Unsupported function [funciton=" << fn_name <<
"]";
+ if (col_val_range.is_fixed_value_range()) {
+ OrcPredicate in_predicate;
+ in_predicate.col_name = col_val_range.column_name();
+ in_predicate.data_type = predicate_data_type;
+ in_predicate.op = SQLFilterOp::FILTER_IN;
+ for (const auto& value : col_val_range.get_fixed_value_set()) {
+ auto [valid, literal] = convert_to_orc_literal<primitive_type>(
+ type, &value, col_val_range.precision(),
col_val_range.scale());
+ if (valid) {
+ in_predicate.literals.push_back(literal);
+ }
}
- return false;
- default:
- VLOG_CRITICAL << "Unsupported Opcode [OpCode=" << expr->op() << "]";
- return false;
+ if (!in_predicate.literals.empty()) {
+ predicates.emplace_back(in_predicate);
+ }
+ return predicates;
}
-}
-void OrcReader::_build_less_than(const VExprSPtr& expr,
- std::unique_ptr<orc::SearchArgumentBuilder>&
builder) {
- DCHECK(expr->children().size() == 2);
- DCHECK(expr->children()[0]->is_slot_ref());
- DCHECK(expr->children()[1]->is_literal());
- const auto* slot_ref = static_cast<const
VSlotRef*>(expr->children()[0].get());
- const auto* literal = static_cast<const
VLiteral*>(expr->children()[1].get());
- DCHECK(_vslot_ref_to_orc_predicate_data_type.contains(slot_ref));
- auto predicate_type = _vslot_ref_to_orc_predicate_data_type[slot_ref];
- DCHECK(_vliteral_to_orc_literal.contains(literal));
- auto orc_literal = _vliteral_to_orc_literal.find(literal)->second;
- builder->lessThan(slot_ref->expr_name(), predicate_type, orc_literal);
-}
-
-void OrcReader::_build_less_than_equals(const VExprSPtr& expr,
-
std::unique_ptr<orc::SearchArgumentBuilder>& builder) {
- DCHECK(expr->children().size() == 2);
- DCHECK(expr->children()[0]->is_slot_ref());
- DCHECK(expr->children()[1]->is_literal());
- const auto* slot_ref = static_cast<const
VSlotRef*>(expr->children()[0].get());
- const auto* literal = static_cast<const
VLiteral*>(expr->children()[1].get());
- DCHECK(_vslot_ref_to_orc_predicate_data_type.contains(slot_ref));
- auto predicate_type = _vslot_ref_to_orc_predicate_data_type[slot_ref];
- DCHECK(_vliteral_to_orc_literal.contains(literal));
- auto orc_literal = _vliteral_to_orc_literal.find(literal)->second;
- builder->lessThanEquals(slot_ref->expr_name(), predicate_type,
orc_literal);
-}
-
-void OrcReader::_build_equals(const VExprSPtr& expr,
- std::unique_ptr<orc::SearchArgumentBuilder>&
builder) {
- DCHECK(expr->children().size() == 2);
- DCHECK(expr->children()[0]->is_slot_ref());
- DCHECK(expr->children()[1]->is_literal());
- const auto* slot_ref = static_cast<const
VSlotRef*>(expr->children()[0].get());
- const auto* literal = static_cast<const
VLiteral*>(expr->children()[1].get());
- DCHECK(_vslot_ref_to_orc_predicate_data_type.contains(slot_ref));
- auto predicate_type = _vslot_ref_to_orc_predicate_data_type[slot_ref];
- DCHECK(_vliteral_to_orc_literal.contains(literal));
- auto orc_literal = _vliteral_to_orc_literal.find(literal)->second;
- builder->equals(slot_ref->expr_name(), predicate_type, orc_literal);
-}
+ const auto& high_value = col_val_range.get_range_max_value();
+ const auto& low_value = col_val_range.get_range_min_value();
+ const auto& high_op = col_val_range.get_range_high_op();
+ const auto& low_op = col_val_range.get_range_low_op();
-void OrcReader::_build_filter_in(const VExprSPtr& expr,
- std::unique_ptr<orc::SearchArgumentBuilder>&
builder) {
- DCHECK(expr->children().size() >= 2);
- DCHECK(expr->children()[0]->is_slot_ref());
- const auto* slot_ref = static_cast<const
VSlotRef*>(expr->children()[0].get());
- std::vector<orc::Literal> literals;
- DCHECK(_vslot_ref_to_orc_predicate_data_type.contains(slot_ref));
- orc::PredicateDataType predicate_type =
_vslot_ref_to_orc_predicate_data_type[slot_ref];
- for (size_t i = 1; i < expr->children().size(); ++i) {
- DCHECK(expr->children()[i]->is_literal());
- const auto* literal = static_cast<const
VLiteral*>(expr->children()[i].get());
- if (_vliteral_to_orc_literal.contains(literal)) {
- auto orc_literal = _vliteral_to_orc_literal.find(literal)->second;
- literals.emplace_back(orc_literal);
- }
- }
- DCHECK(!literals.empty());
- if (literals.size() == 1) {
- builder->equals(slot_ref->expr_name(), predicate_type, literals[0]);
- } else {
- builder->in(slot_ref->expr_name(), predicate_type, literals);
+ // orc can only push down is_null. When col_value_range._contain_null =
true, only indicating that
+ // value can be null, not equals null, so ignore _contain_null in
col_value_range
+ if (col_val_range.is_high_value_maximum() && high_op ==
SQLFilterOp::FILTER_LESS_OR_EQUAL &&
+ col_val_range.is_low_value_mininum() && low_op ==
SQLFilterOp::FILTER_LARGER_OR_EQUAL) {
+ return predicates;
}
-}
-void OrcReader::_build_is_null(const VExprSPtr& expr,
- std::unique_ptr<orc::SearchArgumentBuilder>&
builder) {
- DCHECK(expr->children().size() == 1);
- DCHECK(expr->children()[0]->is_slot_ref());
- const auto* slot_ref = static_cast<const
VSlotRef*>(expr->children()[0].get());
- DCHECK(_vslot_ref_to_orc_predicate_data_type.contains(slot_ref));
- auto predicate_type = _vslot_ref_to_orc_predicate_data_type[slot_ref];
- builder->isNull(slot_ref->expr_name(), predicate_type);
+ if (low_value < high_value) {
+ if (!col_val_range.is_low_value_mininum() ||
+ SQLFilterOp::FILTER_LARGER_OR_EQUAL != low_op) {
+ auto [valid, low_literal] = convert_to_orc_literal<primitive_type>(
+ type, &low_value, col_val_range.precision(),
col_val_range.scale());
+ if (valid) {
+ OrcPredicate low_predicate;
+ low_predicate.col_name = col_val_range.column_name();
+ low_predicate.data_type = predicate_data_type;
+ low_predicate.op = low_op;
+ low_predicate.literals.emplace_back(low_literal);
+ predicates.emplace_back(low_predicate);
+ }
+ }
+ if (!col_val_range.is_high_value_maximum() ||
+ SQLFilterOp::FILTER_LESS_OR_EQUAL != high_op) {
+ auto [valid, high_literal] =
convert_to_orc_literal<primitive_type>(
+ type, &high_value, col_val_range.precision(),
col_val_range.scale());
+ if (valid) {
+ OrcPredicate high_predicate;
+ high_predicate.col_name = col_val_range.column_name();
+ high_predicate.data_type = predicate_data_type;
+ high_predicate.op = high_op;
+ high_predicate.literals.emplace_back(high_literal);
+ predicates.emplace_back(high_predicate);
+ }
+ }
+ }
+ return predicates;
}
-bool OrcReader::_build_search_argument(const VExprSPtr& expr,
-
std::unique_ptr<orc::SearchArgumentBuilder>& builder) {
- // OPTIMIZE: check expr only once
- if (!_check_expr_can_push_down(expr)) {
+bool static build_search_argument(std::vector<OrcPredicate>& predicates, int
index,
+ std::unique_ptr<orc::SearchArgumentBuilder>&
builder) {
+ if (index >= predicates.size()) {
return false;
}
- switch (expr->op()) {
- case TExprOpcode::COMPOUND_AND: {
+ if (index < predicates.size() - 1) {
builder->startAnd();
- bool at_least_one_can_push_down = false;
- for (const auto& child : expr->children()) {
- if (_build_search_argument(child, builder)) {
- at_least_one_can_push_down = true;
- }
- }
- DCHECK(at_least_one_can_push_down);
- builder->end();
- break;
}
- case TExprOpcode::COMPOUND_OR: {
- builder->startOr();
- bool all_can_push_down = true;
- for (const auto& child : expr->children()) {
- if (!_build_search_argument(child, builder)) {
- all_can_push_down = false;
- }
+ OrcPredicate& predicate = predicates[index];
+ switch (predicate.op) {
+ case SQLFilterOp::FILTER_IN: {
+ if (predicate.literals.size() == 1) {
+ builder->equals(predicate.col_name, predicate.data_type,
predicate.literals[0]);
+ } else {
+ builder->in(predicate.col_name, predicate.data_type,
predicate.literals);
}
- DCHECK(all_can_push_down);
- builder->end();
break;
}
- case TExprOpcode::COMPOUND_NOT: {
- DCHECK_EQ(expr->children().size(), 1);
- builder->startNot();
- auto res = _build_search_argument(expr->children()[0], builder);
- DCHECK(res);
- builder->end();
- break;
- }
- case TExprOpcode::GE:
- builder->startNot();
- _build_less_than(expr, builder);
- builder->end();
- break;
- case TExprOpcode::GT:
- builder->startNot();
- _build_less_than_equals(expr, builder);
- builder->end();
- break;
- case TExprOpcode::LE:
- _build_less_than_equals(expr, builder);
+ case SQLFilterOp::FILTER_LESS:
+ builder->lessThan(predicate.col_name, predicate.data_type,
predicate.literals[0]);
break;
- case TExprOpcode::LT:
- _build_less_than(expr, builder);
+ case SQLFilterOp::FILTER_LESS_OR_EQUAL:
+ builder->lessThanEquals(predicate.col_name, predicate.data_type,
predicate.literals[0]);
break;
- case TExprOpcode::EQ:
- _build_equals(expr, builder);
- break;
- case TExprOpcode::NE:
+ case SQLFilterOp::FILTER_LARGER: {
builder->startNot();
- _build_equals(expr, builder);
+ builder->lessThanEquals(predicate.col_name, predicate.data_type,
predicate.literals[0]);
builder->end();
break;
- case TExprOpcode::FILTER_IN:
- _build_filter_in(expr, builder);
- break;
- case TExprOpcode::FILTER_NOT_IN:
+ }
+ case SQLFilterOp::FILTER_LARGER_OR_EQUAL: {
builder->startNot();
- _build_filter_in(expr, builder);
+ builder->lessThan(predicate.col_name, predicate.data_type,
predicate.literals[0]);
builder->end();
break;
- // is null and is not null is represented as function call
- case TExprOpcode::INVALID_OPCODE:
- DCHECK(expr->node_type() == TExprNodeType::FUNCTION_CALL);
- if (expr->fn().name.function_name == "is_null_pred") {
- _build_is_null(expr, builder);
- } else if (expr->fn().name.function_name == "is_not_null_pred") {
- builder->startNot();
- _build_is_null(expr, builder);
- builder->end();
- } else {
- // should not reach here, because _check_expr_can_push_down has
already checked
- __builtin_unreachable();
- }
- break;
-
+ }
default:
- // should not reach here, because _check_expr_can_push_down has
already checked
- __builtin_unreachable();
+ return false;
+ }
+ if (index < predicates.size() - 1) {
+ bool can_build = build_search_argument(predicates, index + 1, builder);
+ if (!can_build) {
+ return false;
+ }
+ builder->end();
}
return true;
}
-bool OrcReader::_init_search_argument(const VExprContextSPtrs& conjuncts) {
- if (!_enable_filter_by_min_max) {
+bool OrcReader::_init_search_argument(
+ std::unordered_map<std::string, ColumnValueRangeType>*
colname_to_value_range) {
+ if ((!_enable_filter_by_min_max) || colname_to_value_range->empty()) {
return false;
}
-
- // build search argument, if any expr can not be pushed down, return false
- auto builder = orc::SearchArgumentFactory::newBuilder();
- bool at_least_one_can_push_down = false;
- builder->startAnd();
- for (const auto& expr_ctx : conjuncts) {
- _vslot_ref_to_orc_predicate_data_type.clear();
- _vliteral_to_orc_literal.clear();
- if (_build_search_argument(expr_ctx->root(), builder)) {
- at_least_one_can_push_down = true;
+ std::vector<OrcPredicate> predicates;
+ auto& root_type = _reader->getType();
+ std::unordered_map<std::string, const orc::Type*> type_map;
+ for (int i = 0; i < root_type.getSubtypeCount(); ++i) {
+ type_map.emplace(get_field_name_lower_case(&root_type, i),
root_type.getSubtype(i));
+ }
+ for (auto& col_name : _lazy_read_ctx.all_read_columns) {
+ auto iter = colname_to_value_range->find(col_name);
+ if (iter == colname_to_value_range->end()) {
+ continue;
}
+ auto type_it = type_map.find(_col_name_to_file_col_name[col_name]);
+ if (type_it == type_map.end()) {
+ continue;
+ }
+ std::visit(
+ [&](auto& range) {
+ std::vector<OrcPredicate> value_predicates =
+ value_range_to_predicate(range, type_it->second);
+ for (auto& range_predicate : value_predicates) {
+ predicates.emplace_back(range_predicate);
+ }
+ },
+ iter->second);
}
- if (!at_least_one_can_push_down) {
- // if all exprs can not be pushed down, builder->end() will throw
exception
+ if (predicates.empty()) {
+ return false;
+ }
+ std::unique_ptr<orc::SearchArgumentBuilder> builder =
orc::SearchArgumentFactory::newBuilder();
+ if (build_search_argument(predicates, 0, builder)) {
+ std::unique_ptr<orc::SearchArgument> sargs = builder->build();
+ _row_reader_options.searchArgument(std::move(sargs));
+ return true;
+ } else {
return false;
}
- builder->end();
-
- auto sargs = builder->build();
- _profile->add_info_string("OrcReader SearchArgument: ", sargs->toString());
- _row_reader_options.searchArgument(std::move(sargs));
- return true;
}
Status OrcReader::set_fill_columns(
@@ -960,7 +763,7 @@ Status OrcReader::set_fill_columns(
visit_slot(child.get());
}
} else if (VInPredicate* in_predicate =
typeid_cast<VInPredicate*>(filter_impl)) {
- if (!in_predicate->children().empty()) {
+ if (in_predicate->children().size() > 0) {
visit_slot(in_predicate->children()[0].get());
}
} else {
@@ -1041,7 +844,7 @@ Status OrcReader::set_fill_columns(
_lazy_read_ctx.can_lazy_read = true;
}
- if (_lazy_read_ctx.conjuncts.empty() ||
!_init_search_argument(_lazy_read_ctx.conjuncts)) {
+ if (_colname_to_value_range == nullptr ||
!_init_search_argument(_colname_to_value_range)) {
_lazy_read_ctx.can_lazy_read = false;
}
try {
@@ -1196,8 +999,7 @@ Status OrcReader::_fill_partition_columns(
if (num_deserialized != rows) {
return Status::InternalError(
"Failed to fill partition column: {}={} ."
- "Number of rows expected to be written : {}, number of
rows actually "
- "written : "
+ "Number of rows expected to be written : {}, number of
rows actually written : "
"{}",
slot_desc->col_name(), value, num_deserialized, rows);
}
@@ -1808,12 +1610,6 @@ std::string OrcReader::get_field_name_lower_case(const
orc::Type* orc_type, int
Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
RETURN_IF_ERROR(get_next_block_impl(block, read_rows, eof));
- if (*eof) {
- COUNTER_UPDATE(_orc_profile.selected_row_group_count,
- _reader_metrics.SelectedRowGroupCount);
- COUNTER_UPDATE(_orc_profile.evaluated_row_group_count,
- _reader_metrics.EvaluatedRowGroupCount);
- }
if (_orc_filter) {
RETURN_IF_ERROR(_orc_filter->get_status());
}
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h
b/be/src/vec/exec/format/orc/vorc_reader.h
index 6bbf3bead1e..3154c39a97d 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -18,9 +18,9 @@
#pragma once
#include <cctz/time_zone.h>
+#include <stddef.h>
+#include <stdint.h>
-#include <cstddef>
-#include <cstdint>
#include <list>
#include <memory>
#include <orc/OrcFile.hh>
@@ -41,7 +41,6 @@
#include "orc/Reader.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
-#include "orc/sargs/Literal.hh"
#include "runtime/types.h"
#include "util/runtime_profile.h"
#include "vec/aggregate_functions/aggregate_function.h"
@@ -52,8 +51,6 @@
#include "vec/exec/format/format_common.h"
#include "vec/exec/format/generic_reader.h"
#include "vec/exec/format/table/transactional_hive_reader.h"
-#include "vec/exprs/vliteral.h"
-#include "vec/exprs/vslot_ref.h"
namespace doris {
class RuntimeState;
@@ -83,6 +80,13 @@ namespace doris::vectorized {
class ORCFileInputStream;
+struct OrcPredicate {
+ std::string col_name;
+ orc::PredicateDataType data_type;
+ std::vector<orc::Literal> literals;
+ SQLFilterOp op;
+};
+
struct LazyReadContext {
VExprContextSPtrs conjuncts;
bool can_lazy_read = false;
@@ -223,8 +227,6 @@ private:
RuntimeProfile::Counter* decode_value_time = nullptr;
RuntimeProfile::Counter* decode_null_map_time = nullptr;
RuntimeProfile::Counter* filter_block_time = nullptr;
- RuntimeProfile::Counter* selected_row_group_count = nullptr;
- RuntimeProfile::Counter* evaluated_row_group_count = nullptr;
};
class ORCFilterImpl : public orc::ORCFilter {
@@ -288,27 +290,8 @@ private:
bool* is_hive1_orc);
static bool _check_acid_schema(const orc::Type& type);
static const orc::Type& _remove_acid(const orc::Type& type);
-
- // functions for building search argument until _init_search_argument
- std::tuple<bool, orc::Literal, orc::PredicateDataType> _make_orc_literal(
- const VSlotRef* slot_ref, const VLiteral* literal);
- bool _check_slot_can_push_down(const VExprSPtr& expr);
- bool _check_literal_can_push_down(const VExprSPtr& expr, uint16_t
child_id);
- bool _check_rest_children_can_push_down(const VExprSPtr& expr);
- bool _check_expr_can_push_down(const VExprSPtr& expr);
- void _build_less_than(const VExprSPtr& expr,
- std::unique_ptr<orc::SearchArgumentBuilder>&
builder);
- void _build_less_than_equals(const VExprSPtr& expr,
- std::unique_ptr<orc::SearchArgumentBuilder>&
builder);
- void _build_equals(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
- void _build_filter_in(const VExprSPtr& expr,
- std::unique_ptr<orc::SearchArgumentBuilder>&
builder);
- void _build_is_null(const VExprSPtr& expr,
- std::unique_ptr<orc::SearchArgumentBuilder>& builder);
- bool _build_search_argument(const VExprSPtr& expr,
- std::unique_ptr<orc::SearchArgumentBuilder>&
builder);
- bool _init_search_argument(const VExprContextSPtrs& conjuncts);
-
+ bool _init_search_argument(
+ std::unordered_map<std::string, ColumnValueRangeType>*
colname_to_value_range);
void _init_bloom_filter(
std::unordered_map<std::string, ColumnValueRangeType>*
colname_to_value_range);
void _init_system_properties();
@@ -594,14 +577,11 @@ private:
bool _is_hive1_orc_or_use_idx = false;
std::unordered_map<std::string, std::string> _col_name_to_file_col_name;
- // TODO: check if we can remove _col_name_to_file_col_name_low_case
- std::unordered_map<std::string, std::string>
_col_name_to_file_col_name_low_case;
std::unordered_map<std::string, const orc::Type*> _type_map;
std::vector<const orc::Type*> _col_orc_type;
std::unique_ptr<ORCFileInputStream> _file_input_stream;
Statistics _statistics;
OrcProfile _orc_profile;
- orc::ReaderMetrics _reader_metrics;
std::unique_ptr<orc::ColumnVectorBatch> _batch;
std::unique_ptr<orc::Reader> _reader;
@@ -647,9 +627,6 @@ private:
std::unordered_map<std::string, std::string> _table_col_to_file_col;
//support iceberg position delete .
std::vector<int64_t>* _position_delete_ordered_rowids = nullptr;
- std::unordered_map<const VSlotRef*, orc::PredicateDataType>
- _vslot_ref_to_orc_predicate_data_type;
- std::unordered_map<const VLiteral*, orc::Literal> _vliteral_to_orc_literal;
};
class ORCFileInputStream : public orc::InputStream, public ProfileCollector {
diff --git a/be/test/exec/test_data/orc_scanner/orders.orc
b/be/test/exec/test_data/orc_scanner/orders.orc
deleted file mode 100644
index 6fad5043288..00000000000
Binary files a/be/test/exec/test_data/orc_scanner/orders.orc and /dev/null
differ
diff --git a/be/test/testutil/desc_tbl_builder.cpp
b/be/test/testutil/desc_tbl_builder.cpp
index 6404d1c5449..4cba9a44a4b 100644
--- a/be/test/testutil/desc_tbl_builder.cpp
+++ b/be/test/testutil/desc_tbl_builder.cpp
@@ -17,9 +17,20 @@
#include "testutil/desc_tbl_builder.h"
-#include <gtest/gtest.h>
+#include <glog/logging.h>
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+#include <vector>
+
+#include "common/object_pool.h"
#include "common/status.h"
+#include "gtest/gtest_pred_impl.h"
+#include "runtime/define_primitive_type.h"
+#include "runtime/descriptors.h"
+#include "util/bit_util.h"
+
+using std::vector;
namespace doris {
@@ -33,7 +44,7 @@ TupleDescBuilder& DescriptorTblBuilder::declare_tuple() {
// item_id of -1 indicates no itemTupleId
static TSlotDescriptor make_slot_descriptor(int id, int parent_id, const
TypeDescriptor& type,
- const std::string& name, int
slot_idx, int item_id) {
+ int slot_idx, int item_id) {
int null_byte = slot_idx / 8;
int null_bit = slot_idx % 8;
TSlotDescriptor slot_desc;
@@ -47,7 +58,6 @@ static TSlotDescriptor make_slot_descriptor(int id, int
parent_id, const TypeDes
slot_desc.__set_nullIndicatorBit(null_bit);
slot_desc.__set_slotIdx(slot_idx);
slot_desc.__set_isMaterialized(true);
- slot_desc.__set_colName(name);
// if (item_id != -1) {
// slot_desc.__set_itemTupleId(item_id);
// }
@@ -68,9 +78,8 @@ DescriptorTbl* DescriptorTblBuilder::build() {
int tuple_id = 0;
int slot_id = 0;
- for (auto& _tuples_desc : _tuples_descs) {
- build_tuple(_tuples_desc->slot_types(), _tuples_desc->slot_names(),
&thrift_desc_tbl,
- &tuple_id, &slot_id);
+ for (int i = 0; i < _tuples_descs.size(); ++i) {
+ build_tuple(_tuples_descs[i]->slot_types(), &thrift_desc_tbl,
&tuple_id, &slot_id);
}
Status status = DescriptorTbl::create(_obj_pool, thrift_desc_tbl,
&desc_tbl);
@@ -78,8 +87,7 @@ DescriptorTbl* DescriptorTblBuilder::build() {
return desc_tbl;
}
-TTupleDescriptor DescriptorTblBuilder::build_tuple(const
std::vector<TypeDescriptor>& slot_types,
- const
std::vector<std::string>& slot_names,
+TTupleDescriptor DescriptorTblBuilder::build_tuple(const
vector<TypeDescriptor>& slot_types,
TDescriptorTable*
thrift_desc_tbl,
int* next_tuple_id, int*
slot_id) {
// We never materialize struct slots (there's no in-memory representation
of structs,
@@ -87,8 +95,7 @@ TTupleDescriptor DescriptorTblBuilder::build_tuple(const
std::vector<TypeDescrip
// still have a struct item type. In this case, the array item tuple
contains the
// "inlined" struct fields.
if (slot_types.size() == 1 && slot_types[0].type == TYPE_STRUCT) {
- return build_tuple(slot_types[0].children, slot_types[0].field_names,
thrift_desc_tbl,
- next_tuple_id, slot_id);
+ return build_tuple(slot_types[0].children, thrift_desc_tbl,
next_tuple_id, slot_id);
}
int tuple_id = *next_tuple_id;
@@ -104,7 +111,7 @@ TTupleDescriptor DescriptorTblBuilder::build_tuple(const
std::vector<TypeDescrip
// }
thrift_desc_tbl->slotDescriptors.push_back(
- make_slot_descriptor(*slot_id, tuple_id, slot_types[i],
slot_names[i], i, item_id));
+ make_slot_descriptor(*slot_id, tuple_id, slot_types[i], i,
item_id));
thrift_desc_tbl->__isset.slotDescriptors = true;
++(*slot_id);
}
diff --git a/be/test/testutil/desc_tbl_builder.h
b/be/test/testutil/desc_tbl_builder.h
index 968b29bd001..c29ef9acd43 100644
--- a/be/test/testutil/desc_tbl_builder.h
+++ b/be/test/testutil/desc_tbl_builder.h
@@ -20,16 +20,15 @@
#include <gen_cpp/Descriptors_types.h>
-#include <tuple>
#include <vector>
-#include "common/object_pool.h"
-#include "runtime/descriptors.h"
#include "runtime/types.h"
namespace doris {
+class ObjectPool;
class TupleDescBuilder;
+class DescriptorTbl;
// Aids in the construction of a DescriptorTbl by declaring tuples and slots
// associated with those tuples.
@@ -41,7 +40,6 @@ class TupleDescBuilder;
// DescriptorTblBuilder builder;
// builder.declare_tuple() << TYPE_TINYINT << TYPE_TIMESTAMP; // gets TupleId 0
// builder.declare_tuple() << TYPE_FLOAT; // gets TupleId 1
-// builder.declare_tuple() << std::make_tuple(TYPE_INT, "col1") <<
std::make_tuple(TYPE_STRING, "col2"); // gets Tuple with type and name
// DescriptorTbl desc_tbl = builder.build();
class DescriptorTblBuilder {
public:
@@ -59,31 +57,20 @@ private:
std::vector<TupleDescBuilder*> _tuples_descs;
TTupleDescriptor build_tuple(const std::vector<TypeDescriptor>& slot_types,
- const std::vector<std::string>& slot_names,
TDescriptorTable* thrift_desc_tbl, int*
tuple_id, int* slot_id);
};
class TupleDescBuilder {
public:
- using SlotType = std::tuple<TypeDescriptor, std::string>;
- TupleDescBuilder& operator<<(const SlotType& slot) {
- _slot_types.push_back(std::get<0>(slot));
- _slot_names.push_back(std::get<1>(slot));
- return *this;
- }
-
TupleDescBuilder& operator<<(const TypeDescriptor& slot_type) {
_slot_types.push_back(slot_type);
- _slot_names.emplace_back("");
return *this;
}
std::vector<TypeDescriptor> slot_types() const { return _slot_types; }
- std::vector<std::string> slot_names() const { return _slot_names; }
private:
std::vector<TypeDescriptor> _slot_types;
- std::vector<std::string> _slot_names;
};
} // end namespace doris
diff --git a/be/test/vec/exec/orc_reader_test.cpp
b/be/test/vec/exec/orc_reader_test.cpp
deleted file mode 100644
index ff7452ae625..00000000000
--- a/be/test/vec/exec/orc_reader_test.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include <memory>
-#include <string>
-#include <tuple>
-#include <vector>
-
-#include "orc/sargs/SearchArgument.hh"
-#include "runtime/define_primitive_type.h"
-#include "runtime/exec_env.h"
-#include "runtime/runtime_state.h"
-#include "testutil/desc_tbl_builder.h"
-#include "vec/exec/format/orc/orc_memory_pool.h"
-#include "vec/exec/format/orc/vorc_reader.h"
-#include "vec/exprs/vexpr_context.h"
-#include "vec/exprs/vexpr_fwd.h"
-#include "vec/utils/util.hpp"
-namespace doris::vectorized {
-class OrcReaderTest : public testing::Test {
-public:
- OrcReaderTest() = default;
- ~OrcReaderTest() override = default;
-
-private:
- static constexpr const char* CANNOT_PUSH_DOWN_ERROR = "can't push down";
- std::string build_search_argument(const std::string& expr) {
- // build orc_reader for table orders
- std::vector<std::string> column_names = {
- "o_orderkey", "o_custkey", "o_orderstatus",
"o_totalprice", "o_orderdate",
- "o_orderpriority", "o_clerk", "o_shippriority", "o_comment"};
- ObjectPool object_pool;
- DescriptorTblBuilder builder(&object_pool);
- builder.declare_tuple() << std::make_tuple(TYPE_INT, "o_orderkey")
- << std::make_tuple(TYPE_INT, "o_custkey")
- << std::make_tuple(TYPE_STRING,
"o_orderstatus")
- << std::make_tuple(TYPE_DOUBLE, "o_totalprice")
- << std::make_tuple(TYPE_DATE, "o_orderdate")
- << std::make_tuple(TYPE_STRING,
"o_orderpriority")
- << std::make_tuple(TYPE_STRING, "o_clerk")
- << std::make_tuple(TYPE_INT, "o_shippriority")
- << std::make_tuple(TYPE_STRING, "o_comment");
- DescriptorTbl* desc_tbl = builder.build();
- auto* tuple_desc =
const_cast<TupleDescriptor*>(desc_tbl->get_tuple_descriptor(0));
- RowDescriptor row_desc(tuple_desc, false);
- TFileScanRangeParams params;
- TFileRangeDesc range;
- range.path = "./be/test/exec/test_data/orc_scanner/orders.orc";
- range.start_offset = 0;
- range.size = 1293;
- auto reader = OrcReader::create_unique(params, range, "", nullptr,
true);
- auto status = reader->init_reader(&column_names, nullptr, {}, false,
tuple_desc, &row_desc,
- nullptr, nullptr);
- EXPECT_TRUE(status.ok());
-
- // deserialize expr
- auto exprx = apache::thrift::from_json_string<TExpr>(expr);
- VExprContextSPtr context;
- status = VExpr::create_expr_tree(exprx, context);
- EXPECT_TRUE(status.ok());
-
- // prepare expr context
- RuntimeState state;
- state.set_desc_tbl(desc_tbl);
- status = context->prepare(&state, row_desc);
- EXPECT_TRUE(status.ok());
-
- // build search argument
- auto sarg_builder = orc::SearchArgumentFactory::newBuilder();
- auto res = reader->_build_search_argument(context->root(),
sarg_builder);
- if (!res) {
- return CANNOT_PUSH_DOWN_ERROR;
- }
- return sarg_builder->build()->toString();
- }
-};
-
-TEST_F(OrcReaderTest, test_build_search_argument) {
- ExecEnv::GetInstance()->set_orc_memory_pool(new ORCMemoryPool());
- std::vector<std::string>
- exprs =
- {
- // select count(o_orderkey) from tpch1_orc.orders
where o_orderkey < 100 or o_orderkey > 5999900 or o_orderkey in (1000000,
2000000, 3000000);
-
R"|({"1":{"lst":["rec",13,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i
[...]
- // select count(o_orderkey) from tpch1_orc.orders
where o_orderkey is null or (o_orderkey between 100 and 1000 and o_orderkey not
in (200, 300, 400));
-
R"|({"1":{"lst":["rec",16,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":20},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"4":{"i32":1},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"is_null_pred"}}},"2":{"i32":0},"3":{"lst":["rec",1,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":
[...]
- // select count(o_orderkey) from tpch1_orc.orders
where o_orderkey is null or (o_orderkey between 1000000 and 1200000 and
o_orderkey != 1100000);
-
R"|({"1":{"lst":["rec",14,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":20},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"4":{"i32":1},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"is_null_pred"}}},"2":{"i32":0},"3":{"lst":["rec",1,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":
[...]
- // SELECT count(o_orderkey) FROM tpch1_orc.orders
WHERE o_orderkey IN (1000000, 2000000, 3000000) OR (o_orderdate >= '1994-01-01'
AND o_orderdate <= '1994-12-31');
-
R"|({"1":{"lst":["rec",13,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":11},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":5},"4":{"i32":4},"11":{"rec":{"1":{"tf":0}}},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":16},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"re
[...]
- // select count(o_orderkey) from tpch1_orc.orders
where o_orderkey < 2 or (o_comment like '%delayed%' and o_orderpriority =
'1-URGENT');
-
R"|({"1":{"lst":["rec",11,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":11},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"lt"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},
[...]
- // select count(o_orderkey) from tpch1_orc.orders
where o_orderkey < 2 or (o_totalprice < 173665.47 and o_custkey >= 36901);
-
R"|({"1":{"lst":["rec",11,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":11},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"lt"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},
[...]
- // select count(o_orderkey) from tpch1_orc.orders
where o_orderkey < 1 + 1;
-
R"|({"1":{"lst":["rec",3,{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":11},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"lt"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]},"3":{"i64":-1}},{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]},"3":{"i64":-1}}]},"4":{"rec":{"1":{"lst":["rec",1,{"1":{
[...]
- // select count(o_orderkey) from tpch1_orc.orders
where o_orderkey in (null, 25);
-
R"|({"1":{"lst":["rec",4,{"1":{"i32":11},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":5},"4":{"i32":3},"11":{"rec":{"1":{"tf":0}}},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":16},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]},"3":{"i64":-1}}},"4":{"i32":0},"15":{"rec":{"1":{"i32":0},"2":{"i32":0},"3":{"i32":-1}}},"20":{"i32":-1},"29":{"tf":1},"36":{"str":"o_orderkey"}},{
[...]
- // SELECT count(o_orderkey) FROM tpch1_orc.orders
WHERE o_comment LIKE '%delayed%' OR o_orderpriority = '1-URGENT';
-
R"|({"1":{"lst":["rec",7,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":20},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"like"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"
[...]
- // select count(o_orderkey) from tpch1_orc.orders
where o_orderkey between 1 and 100 or random() > 0.5;
-
R"|({"1":{"lst":["rec",11,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":2},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i
[...]
- // select count(o_orderkey) from tpch1_orc.orders
where lower(o_orderpriority) = '1-urgent';
-
R"|({"1":{"lst":["rec",4,{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":9},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"eq"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":15},"2":{"i32":65533}}}}]},"3":{"i64":-1}},{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":15},"2":{"i32":65533}}}}]},"3":{"i64":-1}}]},"
[...]
- // select count(o_orderkey) from tpch1_orc.orders
where o_orderkey * 2 < 60;
-
R"|({"1":{"lst":["rec",5,{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":11},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"lt"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":6}}}}]},"3":{"i64":-1}},{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":6}}}}]},"3":{"i64":-1}}]},"4":{"rec":{"1":{"lst":["rec",1,{"1":{
[...]
- // select count(o_orderkey) from tpch1_orc.orders
where o_orderdate is not null;
-
R"|({"1":{"lst":["rec",4,{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":9},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"eq"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":27},"3":{"i32":18},"4":{"i32":0}}}}]},"3":{"i64":-1}},{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":27},"3":{"i32":18},"4":{"i32":0}}}}
[...]
- };
- std::vector<std::string> result_search_arguments = {
- "leaf-0 = (o_orderkey < 100), leaf-1 = (o_orderkey <= 5999900),
leaf-2 "
- "= (o_orderkey "
- "in "
- "[1000000, 2000000, 3000000]), expr = (or leaf-0 (not leaf-1)
leaf-2)",
- "leaf-0 = (o_orderkey is null), leaf-1 = (o_orderkey < 100),
leaf-2 = "
- "(o_orderkey <= "
- "1000), leaf-3 = (o_orderkey in [200, 300, 400]), expr = (and (or "
- "leaf-0 (not leaf-1)) "
- "(or leaf-0 leaf-2) (or leaf-0 (not leaf-3)))",
- "leaf-0 = (o_orderkey is null), leaf-1 = (o_orderkey < 1000000),
leaf-2 = (o_orderkey "
- "<= 1200000), leaf-3 = (o_orderkey = 1100000), expr = (and (or
leaf-0 (not leaf-1)) "
- "(or leaf-0 leaf-2) (or leaf-0 (not leaf-3)))",
- "leaf-0 = (o_orderkey in [1000000, 2000000, 3000000]), leaf-1 =
(o_orderdate < "
- "17121205), leaf-2 = (o_orderdate <= 17121205), expr = (and (or
leaf-0 (not leaf-1)) "
- "(or leaf-0 leaf-2))",
- "leaf-0 = (o_orderkey < 2), leaf-1 = (o_orderpriority = 1-URGENT),
expr = (or leaf-0 "
- "leaf-1)",
- "leaf-0 = (o_orderkey < 2), leaf-1 = (o_custkey < 36901), expr =
(or leaf-0 (not "
- "leaf-1))",
- "leaf-0 = (o_orderkey < 2), expr = leaf-0",
- "leaf-0 = (o_orderkey = 25), expr = leaf-0",
- CANNOT_PUSH_DOWN_ERROR,
- CANNOT_PUSH_DOWN_ERROR,
- CANNOT_PUSH_DOWN_ERROR,
- CANNOT_PUSH_DOWN_ERROR,
- CANNOT_PUSH_DOWN_ERROR,
- };
- for (int i = 0; i < exprs.size(); i++) {
- auto search_argument = build_search_argument(exprs[i]);
- ASSERT_EQ(search_argument, result_search_arguments[i]);
- }
-}
-
-} // namespace doris::vectorized
diff --git a/regression-test/data/external_table_p0/hive/test_hive_orc.out
b/regression-test/data/external_table_p0/hive/test_hive_orc.out
index 03942dbe9fb..066c5d4b4d3 100644
Binary files a/regression-test/data/external_table_p0/hive/test_hive_orc.out
and b/regression-test/data/external_table_p0/hive/test_hive_orc.out differ
diff --git a/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
index 8d85feaa77a..0f837c0abd3 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
@@ -81,39 +81,6 @@ suite("test_hive_orc",
"all_types,p0,external,hive,external_docker,external_dock
qt_string_col_dict_plain_mixed3 """select count(col2) from
string_col_dict_plain_mixed_orc where col1 like '%Test%';"""
}
- def predicate_pushdown = {
- qt_predicate_pushdown1 """ select count(o_orderkey) from
tpch1_orc.orders where o_orderkey is not null and (o_orderkey < 100 or
o_orderkey > 5999900 or o_orderkey in (1000000, 2000000, 3000000)); """
- qt_predicate_pushdown2 """ select count(o_orderkey) from
tpch1_orc.orders where o_orderkey is null or (o_orderkey between 100 and 1000
and o_orderkey not in (200, 300, 400)); """
- qt_predicate_pushdown3 """ select count(o_orderkey) from
tpch1_orc.orders where o_orderkey is not null and (o_orderkey < 100 or
o_orderkey > 5999900 or o_orderkey = 3000000); """
- qt_predicate_pushdown4 """ select count(o_orderkey) from
tpch1_orc.orders where o_orderkey is null or (o_orderkey between 1000000 and
1200000 and o_orderkey != 1100000); """
- qt_predicate_pushdown5 """ SELECT count(o_orderkey) FROM
tpch1_orc.orders WHERE (o_orderdate >= '1994-01-01' AND o_orderdate <=
'1994-12-31') AND (o_orderpriority = '5-LOW' OR o_orderpriority = '3-MEDIUM')
AND o_totalprice > 2000;"""
- qt_predicate_pushdown6 """ SELECT count(o_orderkey) FROM
tpch1_orc.orders WHERE o_orderstatus <> 'F' AND o_custkey < 54321; """
- qt_predicate_pushdown7 """ SELECT count(o_orderkey) FROM
tpch1_orc.orders WHERE o_comment LIKE '%delayed%' OR o_orderpriority =
'1-URGENT'; """
- qt_predicate_pushdown8 """ SELECT count(o_orderkey) FROM
tpch1_orc.orders WHERE o_orderkey IN (1000000, 2000000, 3000000) OR o_clerk =
'Clerk#000000470'; """
-
- qt_predicate_pushdown_in1 """ select count(*) from orc_all_types
where boolean_col in (null); """
- qt_predicate_pushdown_in2 """ select count(*) from orc_all_types
where boolean_col in (null, 0); """
- qt_predicate_pushdown_in3 """ select count(*) from orc_all_types
where boolean_col in (null, 1); """
-
- def test_col_is_null = { String col ->
- "qt_orc_all_types_${col}_is_null" """ select count(*) from
orc_all_types where ${col} is null; """
- }
- test_col_is_null("tinyint_col")
- test_col_is_null("smallint_col")
- test_col_is_null("int_col")
- test_col_is_null("bigint_col")
- test_col_is_null("boolean_col")
- test_col_is_null("float_col")
- test_col_is_null("double_col")
- test_col_is_null("string_col")
- test_col_is_null("binary_col")
- test_col_is_null("timestamp_col")
- test_col_is_null("decimal_col")
- test_col_is_null("char_col")
- test_col_is_null("varchar_col")
- test_col_is_null("date_col")
- }
-
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
logger.info("diable Hive test.")
@@ -141,7 +108,6 @@ suite("test_hive_orc",
"all_types,p0,external,hive,external_docker,external_dock
only_partition_col()
decimals()
string_col_dict_plain_mixed()
- predicate_pushdown()
sql """drop catalog if exists ${catalog_name}"""
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
index 2dd647aa2c1..d9b6357ca0a 100644
---
a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
+++
b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
@@ -19,7 +19,7 @@ suite("test_hive_orc_predicate",
"p0,external,hive,external_docker,external_dock
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
- logger.info("diable Hive test.")
+ logger.info("disable Hive test.")
return;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]