This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 212f087d feat: add inclusive metrics evaluator (#357)
212f087d is described below
commit 212f087d37d5c5f0d14a95dbe0ef3f8d4463a58d
Author: dongxiao <[email protected]>
AuthorDate: Tue Dec 2 23:56:00 2025 +0800
feat: add inclusive metrics evaluator (#357)
---
src/iceberg/CMakeLists.txt | 1 +
src/iceberg/expression/expressions.cc | 162 ----
src/iceberg/expression/expressions.h | 125 ++-
.../expression/inclusive_metrics_evaluator.cc | 521 +++++++++++
.../expression/inclusive_metrics_evaluator.h | 73 ++
src/iceberg/expression/literal.cc | 22 +-
src/iceberg/meson.build | 1 +
src/iceberg/test/CMakeLists.txt | 2 +
.../test/inclusive_metrics_evaluator_test.cc | 948 +++++++++++++++++++++
...lusive_metrics_evaluator_with_transform_test.cc | 485 +++++++++++
src/iceberg/test/meson.build | 2 +
11 files changed, 2159 insertions(+), 183 deletions(-)
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index d5429808..7c71f651 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -25,6 +25,7 @@ set(ICEBERG_SOURCES
expression/evaluator.cc
expression/expression.cc
expression/expressions.cc
+ expression/inclusive_metrics_evaluator.cc
expression/literal.cc
expression/predicate.cc
expression/rewrite_not.cc
diff --git a/src/iceberg/expression/expressions.cc b/src/iceberg/expression/expressions.cc
index 786cc0ab..7eef6023 100644
--- a/src/iceberg/expression/expressions.cc
+++ b/src/iceberg/expression/expressions.cc
@@ -156,56 +156,21 @@ std::shared_ptr<UnboundPredicateImpl<BoundReference>>
Expressions::IsNull(
return IsNull<BoundReference>(Ref(std::move(name)));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::IsNull(
- std::shared_ptr<UnboundTerm<B>> expr) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred,
- UnboundPredicateImpl<B>::Make(Expression::Operation::kIsNull,
std::move(expr)));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::NotNull(
std::string name) {
return NotNull<BoundReference>(Ref(std::move(name)));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::NotNull(
- std::shared_ptr<UnboundTerm<B>> expr) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred,
- UnboundPredicateImpl<B>::Make(Expression::Operation::kNotNull,
std::move(expr)));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::IsNaN(
std::string name) {
return IsNaN<BoundReference>(Ref(std::move(name)));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::IsNaN(
- std::shared_ptr<UnboundTerm<B>> expr) {
- ICEBERG_ASSIGN_OR_THROW(auto pred, UnboundPredicateImpl<B>::Make(
- Expression::Operation::kIsNan,
std::move(expr)));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::NotNaN(
std::string name) {
return NotNaN<BoundReference>(Ref(std::move(name)));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::NotNaN(
- std::shared_ptr<UnboundTerm<B>> expr) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred,
- UnboundPredicateImpl<B>::Make(Expression::Operation::kNotNan,
std::move(expr)));
- return pred;
-}
-
// Template implementations for comparison predicates
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::LessThan(
@@ -213,85 +178,31 @@ std::shared_ptr<UnboundPredicateImpl<BoundReference>>
Expressions::LessThan(
return LessThan<BoundReference>(Ref(std::move(name)), std::move(value));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::LessThan(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kLt,
- std::move(expr),
std::move(value)));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>>
Expressions::LessThanOrEqual(
std::string name, Literal value) {
return LessThanOrEqual<BoundReference>(Ref(std::move(name)),
std::move(value));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::LessThanOrEqual(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kLtEq,
- std::move(expr),
std::move(value)));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::GreaterThan(
std::string name, Literal value) {
return GreaterThan<BoundReference>(Ref(std::move(name)), std::move(value));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::GreaterThan(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kGt,
- std::move(expr),
std::move(value)));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>>
Expressions::GreaterThanOrEqual(
std::string name, Literal value) {
return GreaterThanOrEqual<BoundReference>(Ref(std::move(name)),
std::move(value));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::GreaterThanOrEqual(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kGtEq,
- std::move(expr),
std::move(value)));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>>
Expressions::Equal(std::string name,
Literal value) {
return Equal<BoundReference>(Ref(std::move(name)), std::move(value));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::Equal(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kEq,
- std::move(expr),
std::move(value)));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::NotEqual(
std::string name, Literal value) {
return NotEqual<BoundReference>(Ref(std::move(name)), std::move(value));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::NotEqual(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kNotEq,
- std::move(expr),
std::move(value)));
- return pred;
-}
-
// String predicates
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::StartsWith(
@@ -299,31 +210,11 @@ std::shared_ptr<UnboundPredicateImpl<BoundReference>>
Expressions::StartsWith(
return StartsWith<BoundReference>(Ref(std::move(name)), std::move(value));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::StartsWith(
- std::shared_ptr<UnboundTerm<B>> expr, std::string value) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred,
- UnboundPredicateImpl<B>::Make(Expression::Operation::kStartsWith,
std::move(expr),
- Literal::String(std::move(value))));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>>
Expressions::NotStartsWith(
std::string name, std::string value) {
return NotStartsWith<BoundReference>(Ref(std::move(name)), std::move(value));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::NotStartsWith(
- std::shared_ptr<UnboundTerm<B>> expr, std::string value) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred,
- UnboundPredicateImpl<B>::Make(Expression::Operation::kNotStartsWith,
- std::move(expr),
Literal::String(std::move(value))));
- return pred;
-}
-
// Template implementations for set predicates
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::In(
@@ -331,51 +222,21 @@ std::shared_ptr<UnboundPredicateImpl<BoundReference>>
Expressions::In(
return In<BoundReference>(Ref(std::move(name)), std::move(values));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::In(
- std::shared_ptr<UnboundTerm<B>> expr, std::vector<Literal> values) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kIn,
- std::move(expr),
std::move(values)));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::In(
std::string name, std::initializer_list<Literal> values) {
return In<BoundReference>(Ref(std::move(name)),
std::vector<Literal>(values));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::In(
- std::shared_ptr<UnboundTerm<B>> expr, std::initializer_list<Literal>
values) {
- return In<B>(std::move(expr), std::vector<Literal>(values));
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::NotIn(
std::string name, std::vector<Literal> values) {
return NotIn<BoundReference>(Ref(std::move(name)), std::move(values));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::NotIn(
- std::shared_ptr<UnboundTerm<B>> expr, std::vector<Literal> values) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kNotIn,
- std::move(expr),
std::move(values)));
- return pred;
-}
-
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::NotIn(
std::string name, std::initializer_list<Literal> values) {
return NotIn<BoundReference>(Ref(std::move(name)),
std::vector<Literal>(values));
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::NotIn(
- std::shared_ptr<UnboundTerm<B>> expr, std::initializer_list<Literal>
values) {
- return NotIn<B>(expr, std::vector<Literal>(values));
-}
-
// Template implementations for generic predicate factory
std::shared_ptr<UnboundPredicateImpl<BoundReference>> Expressions::Predicate(
@@ -404,29 +265,6 @@ std::shared_ptr<UnboundPredicateImpl<BoundReference>>
Expressions::Predicate(
return pred;
}
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::Predicate(
- Expression::Operation op, std::shared_ptr<UnboundTerm<B>> expr,
- std::vector<Literal> values) {
- ICEBERG_ASSIGN_OR_THROW(
- auto pred, UnboundPredicateImpl<B>::Make(op, std::move(expr),
std::move(values)));
- return pred;
-}
-
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::Predicate(
- Expression::Operation op, std::shared_ptr<UnboundTerm<B>> expr,
- std::initializer_list<Literal> values) {
- return Predicate<B>(op, std::move(expr), std::vector<Literal>(values));
-}
-
-template <typename B>
-std::shared_ptr<UnboundPredicateImpl<B>> Expressions::Predicate(
- Expression::Operation op, std::shared_ptr<UnboundTerm<B>> expr) {
- ICEBERG_ASSIGN_OR_THROW(auto pred, UnboundPredicateImpl<B>::Make(op,
std::move(expr)));
- return pred;
-}
-
// Constants
std::shared_ptr<True> Expressions::AlwaysTrue() { return True::Instance(); }
diff --git a/src/iceberg/expression/expressions.h b/src/iceberg/expression/expressions.h
index cb1d6df7..92c523ca 100644
--- a/src/iceberg/expression/expressions.h
+++ b/src/iceberg/expression/expressions.h
@@ -27,7 +27,6 @@
#include <string>
#include <vector>
-#include "iceberg/exception.h"
#include "iceberg/expression/aggregate.h"
#include "iceberg/expression/literal.h"
#include "iceberg/expression/predicate.h"
@@ -152,7 +151,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create an IS NULL predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> IsNull(
- std::shared_ptr<UnboundTerm<B>> expr);
+ std::shared_ptr<UnboundTerm<B>> expr) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred,
+ UnboundPredicateImpl<B>::Make(Expression::Operation::kIsNull,
std::move(expr)));
+ return pred;
+ }
/// \brief Create a NOT NULL predicate for a field name.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>>
NotNull(std::string name);
@@ -160,7 +164,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a NOT NULL predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> NotNull(
- std::shared_ptr<UnboundTerm<B>> expr);
+ std::shared_ptr<UnboundTerm<B>> expr) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred,
+ UnboundPredicateImpl<B>::Make(Expression::Operation::kNotNull,
std::move(expr)));
+ return pred;
+ }
/// \brief Create an IS NaN predicate for a field name.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>>
IsNaN(std::string name);
@@ -168,7 +177,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create an IS NaN predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> IsNaN(
- std::shared_ptr<UnboundTerm<B>> expr);
+ std::shared_ptr<UnboundTerm<B>> expr) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred,
+ UnboundPredicateImpl<B>::Make(Expression::Operation::kIsNan,
std::move(expr)));
+ return pred;
+ }
/// \brief Create a NOT NaN predicate for a field name.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>>
NotNaN(std::string name);
@@ -176,7 +190,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a NOT NaN predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> NotNaN(
- std::shared_ptr<UnboundTerm<B>> expr);
+ std::shared_ptr<UnboundTerm<B>> expr) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred,
+ UnboundPredicateImpl<B>::Make(Expression::Operation::kNotNan,
std::move(expr)));
+ return pred;
+ }
// Comparison predicates
@@ -187,7 +206,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a less than predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> LessThan(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value);
+ std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kLt,
+ std::move(expr),
std::move(value)));
+ return pred;
+ }
/// \brief Create a less than or equal predicate for a field name.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>> LessThanOrEqual(
@@ -196,7 +220,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a less than or equal predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> LessThanOrEqual(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value);
+ std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kLtEq,
+ std::move(expr),
std::move(value)));
+ return pred;
+ }
/// \brief Create a greater than predicate for a field name.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>> GreaterThan(
@@ -205,7 +234,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a greater than predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> GreaterThan(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value);
+ std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kGt,
+ std::move(expr),
std::move(value)));
+ return pred;
+ }
/// \brief Create a greater than or equal predicate for a field name.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>>
GreaterThanOrEqual(
@@ -214,7 +248,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a greater than or equal predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> GreaterThanOrEqual(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value);
+ std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kGtEq,
+ std::move(expr),
std::move(value)));
+ return pred;
+ }
/// \brief Create an equal predicate for a field name.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>>
Equal(std::string name,
@@ -223,7 +262,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create an equal predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> Equal(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value);
+ std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kEq,
+ std::move(expr),
std::move(value)));
+ return pred;
+ }
/// \brief Create a not equal predicate for a field name.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>>
NotEqual(std::string name,
@@ -232,7 +276,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a not equal predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> NotEqual(
- std::shared_ptr<UnboundTerm<B>> expr, Literal value);
+ std::shared_ptr<UnboundTerm<B>> expr, Literal value) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kNotEq,
+ std::move(expr),
std::move(value)));
+ return pred;
+ }
// String predicates
@@ -243,7 +292,13 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a starts with predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> StartsWith(
- std::shared_ptr<UnboundTerm<B>> expr, std::string value);
+ std::shared_ptr<UnboundTerm<B>> expr, std::string value) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred,
+ UnboundPredicateImpl<B>::Make(Expression::Operation::kStartsWith,
std::move(expr),
+ Literal::String(std::move(value))));
+ return pred;
+ }
/// \brief Create a not starts with predicate for a field name.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>> NotStartsWith(
@@ -252,7 +307,13 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a not starts with predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> NotStartsWith(
- std::shared_ptr<UnboundTerm<B>> expr, std::string value);
+ std::shared_ptr<UnboundTerm<B>> expr, std::string value) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred,
UnboundPredicateImpl<B>::Make(Expression::Operation::kNotStartsWith,
+ std::move(expr),
+
Literal::String(std::move(value))));
+ return pred;
+ }
// Set predicates
@@ -263,7 +324,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create an IN predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>>
In(std::shared_ptr<UnboundTerm<B>> expr,
- std::vector<Literal>
values);
+ std::vector<Literal>
values) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kIn,
+ std::move(expr),
std::move(values)));
+ return pred;
+ }
/// \brief Create an IN predicate for a field name with initializer list.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>> In(
@@ -272,7 +338,9 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create an IN predicate for an unbound term with initializer list.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> In(
- std::shared_ptr<UnboundTerm<B>> expr, std::initializer_list<Literal>
values);
+ std::shared_ptr<UnboundTerm<B>> expr, std::initializer_list<Literal>
values) {
+ return In<B>(std::move(expr), std::vector<Literal>(values));
+ }
/// \brief Create a NOT IN predicate for a field name.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>> NotIn(
@@ -281,7 +349,12 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a NOT IN predicate for an unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> NotIn(
- std::shared_ptr<UnboundTerm<B>> expr, std::vector<Literal> values);
+ std::shared_ptr<UnboundTerm<B>> expr, std::vector<Literal> values) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred, UnboundPredicateImpl<B>::Make(Expression::Operation::kNotIn,
+ std::move(expr),
std::move(values)));
+ return pred;
+ }
/// \brief Create a NOT IN predicate for a field name with initializer list.
static std::shared_ptr<UnboundPredicateImpl<BoundReference>> NotIn(
@@ -290,7 +363,9 @@ class ICEBERG_EXPORT Expressions {
/// \brief Create a NOT IN predicate for an unbound term with initializer list.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> NotIn(
- std::shared_ptr<UnboundTerm<B>> expr, std::initializer_list<Literal>
values);
+ std::shared_ptr<UnboundTerm<B>> expr, std::initializer_list<Literal>
values) {
+ return NotIn<B>(expr, std::vector<Literal>(values));
+ }
// Generic predicate factory
@@ -314,18 +389,28 @@ class ICEBERG_EXPORT Expressions {
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> Predicate(
Expression::Operation op, std::shared_ptr<UnboundTerm<B>> expr,
- std::vector<Literal> values);
+ std::vector<Literal> values) {
+ ICEBERG_ASSIGN_OR_THROW(
+ auto pred, UnboundPredicateImpl<B>::Make(op, std::move(expr),
std::move(values)));
+ return pred;
+ }
/// \brief Create a predicate with operation and multiple values.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> Predicate(
Expression::Operation op, std::shared_ptr<UnboundTerm<B>> expr,
- std::initializer_list<Literal> values);
+ std::initializer_list<Literal> values) {
+ return Predicate<B>(op, std::move(expr), std::vector<Literal>(values));
+ }
/// \brief Create a unary predicate for unbound term.
template <typename B>
static std::shared_ptr<UnboundPredicateImpl<B>> Predicate(
- Expression::Operation op, std::shared_ptr<UnboundTerm<B>> expr);
+ Expression::Operation op, std::shared_ptr<UnboundTerm<B>> expr) {
+ ICEBERG_ASSIGN_OR_THROW(auto pred,
+ UnboundPredicateImpl<B>::Make(op,
std::move(expr)));
+ return pred;
+ }
// Constants
diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc
new file mode 100644
index 00000000..29f5aba2
--- /dev/null
+++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc
@@ -0,0 +1,521 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/expression/inclusive_metrics_evaluator.h"
+
+#include "iceberg/expression/binder.h"
+#include "iceberg/expression/expression_visitor.h"
+#include "iceberg/expression/rewrite_not.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/schema.h"
+#include "iceberg/transform.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+namespace {
+constexpr bool kRowsMightMatch = true;
+constexpr bool kRowCannotMatch = false;
+constexpr int32_t kInPredicateLimit = 200;
+} // namespace
+
+class InclusiveMetricsVisitor : public BoundVisitor<bool> {
+ public:
+ explicit InclusiveMetricsVisitor(const DataFile& data_file) :
data_file_(data_file) {}
+
+ Result<bool> AlwaysTrue() override { return kRowsMightMatch; }
+
+ Result<bool> AlwaysFalse() override { return kRowCannotMatch; }
+
+ Result<bool> Not(bool child_result) override { return !child_result; }
+
+ Result<bool> And(bool left_result, bool right_result) override {
+ return left_result && right_result;
+ }
+
+ Result<bool> Or(bool left_result, bool right_result) override {
+ return left_result || right_result;
+ }
+
+ Result<bool> IsNull(const std::shared_ptr<Bound>& expr) override {
+ // no need to check whether the field is required because binding evaluates that case
+ // if the column has no null values, the expression cannot match
+ if (IsNonNullPreserving(expr)) {
+ // number of non-nulls is the same as for the ref
+ int32_t id = expr->reference()->field().field_id();
+ if (!MayContainNull(id)) {
+ return kRowCannotMatch;
+ }
+ }
+ return kRowsMightMatch;
+ }
+
+ Result<bool> NotNull(const std::shared_ptr<Bound>& expr) override {
+ // no need to check whether the field is required because binding evaluates that case
+ // if the column has no non-null values, the expression cannot match
+
+ // all terms are null preserving. see #isNullPreserving(Bound)
+ int32_t id = expr->reference()->field().field_id();
+ if (ContainsNullsOnly(id)) {
+ return kRowCannotMatch;
+ }
+
+ return kRowsMightMatch;
+ }
+
+ Result<bool> IsNaN(const std::shared_ptr<Bound>& expr) override {
+ // when there's no nanCounts information, but we already know the column only contains
+ // null, it's guaranteed that there's no NaN value
+ int32_t id = expr->reference()->field().field_id();
+ if (ContainsNullsOnly(id)) {
+ return kRowCannotMatch;
+ }
+ if (dynamic_cast<const BoundReference*>(expr.get()) == nullptr) {
+ return kRowsMightMatch;
+ }
+ auto it = data_file_.nan_value_counts.find(id);
+ if (it != data_file_.nan_value_counts.end() && it->second == 0) {
+ return kRowCannotMatch;
+ }
+ return kRowsMightMatch;
+ }
+
+ Result<bool> NotNaN(const std::shared_ptr<Bound>& expr) override {
+ if (dynamic_cast<const BoundReference*>(expr.get()) == nullptr) {
+ // identity transforms are already removed by this time
+ return kRowsMightMatch;
+ }
+
+ int32_t id = expr->reference()->field().field_id();
+ if (ContainsNaNsOnly(id)) {
+ return kRowCannotMatch;
+ }
+
+ return kRowsMightMatch;
+ }
+
+ Result<bool> Lt(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ // all terms are null preserving. see #isNullPreserving(Bound)
+ int32_t id = expr->reference()->field().field_id();
+ if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) {
+ return kRowCannotMatch;
+ }
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr));
+ if (!lower.has_value() || lower->IsNull() || lower->IsNaN()) {
+ // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
+ return kRowsMightMatch;
+ }
+
+ // this also works for transforms that are order preserving:
+ // if a transform f is order preserving, a < b means that f(a) <= f(b).
+ // because lower <= a for all values of a in the file, f(lower) <= f(a).
+ // when f(lower) >= X then f(a) >= f(lower) >= X, so there is no a such that f(a) < X
+ // f(lower) >= X means rows cannot match
+ if (lower.value() >= lit) {
+ return kRowCannotMatch;
+ }
+
+ return kRowsMightMatch;
+ }
+
+ Result<bool> LtEq(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ // all terms are null preserving. see #isNullPreserving(Bound)
+ int32_t id = expr->reference()->field().field_id();
+ if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) {
+ return kRowCannotMatch;
+ }
+
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr));
+ if (!lower.has_value() || lower->IsNull() || lower->IsNaN()) {
+ // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
+ return kRowsMightMatch;
+ }
+
+ // this also works for transforms that are order preserving:
+ // if a transform f is order preserving, a < b means that f(a) <= f(b).
+ // because lower <= a for all values of a in the file, f(lower) <= f(a).
+ // when f(lower) > X then f(a) >= f(lower) > X, so there is no a such that f(a) <= X
+ // f(lower) > X means rows cannot match
+ if (lower.value() > lit) {
+ return kRowCannotMatch;
+ }
+
+ return kRowsMightMatch;
+ }
+
+ Result<bool> Gt(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ // all terms are null preserving. see #isNullPreserving(Bound)
+ int32_t id = expr->reference()->field().field_id();
+ if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) {
+ return kRowCannotMatch;
+ }
+
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr));
+ if (!upper.has_value() || upper->IsNull()) {
+ return kRowsMightMatch;
+ }
+ if (upper.value() <= lit) {
+ return kRowCannotMatch;
+ }
+
+ return kRowsMightMatch;
+ }
+
+ Result<bool> GtEq(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ // all terms are null preserving. see #isNullPreserving(Bound)
+ int32_t id = expr->reference()->field().field_id();
+ if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) {
+ return kRowCannotMatch;
+ }
+
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr));
+ if (!upper.has_value() || upper->IsNull()) {
+ return kRowsMightMatch;
+ }
+ if (upper.value() < lit) {
+ return kRowCannotMatch;
+ }
+
+ return kRowsMightMatch;
+ }
+
+ Result<bool> Eq(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ // all terms are null preserving. see #isNullPreserving(Bound)
+ int32_t id = expr->reference()->field().field_id();
+ if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) {
+ return kRowCannotMatch;
+ }
+
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr));
+ if (lower.has_value() && !lower->IsNull() && !lower->IsNaN()) {
+ if (lower.value() > lit) {
+ return kRowCannotMatch;
+ }
+ }
+
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr));
+ if (!upper.has_value() || upper->IsNull()) {
+ return kRowsMightMatch;
+ }
+ if (upper.value() < lit) {
+ return kRowCannotMatch;
+ }
+
+ return kRowsMightMatch;
+ }
+
+ Result<bool> NotEq(const std::shared_ptr<Bound>& expr, const Literal& lit)
override {
+ // because the bounds are not necessarily a min or max value, this cannot be answered
+ // using them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col.
+ return kRowsMightMatch;
+ }
+
+ Result<bool> In(const std::shared_ptr<Bound>& expr,
+ const BoundSetPredicate::LiteralSet& literal_set) override {
+ // all terms are null preserving. see #isNullPreserving(Bound)
+ int32_t id = expr->reference()->field().field_id();
+ if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) {
+ return kRowCannotMatch;
+ }
+
+ if (literal_set.size() > kInPredicateLimit) {
+ // skip evaluating the predicate if the number of values is too big
+ return kRowsMightMatch;
+ }
+
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr));
+ if (!lower.has_value() || lower->IsNull() || lower->IsNaN()) {
+ // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
+ return kRowsMightMatch;
+ }
+ auto literals_view = literal_set | std::views::filter([&](const Literal&
lit) {
+ return lower.value() <= lit;
+ });
+ // if all values are less than lower bound, rows cannot match
+ if (literals_view.empty()) {
+ return kRowCannotMatch;
+ }
+
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr));
+ if (!upper.has_value() || upper->IsNull()) {
+ return kRowsMightMatch;
+ }
+ auto filtered_view = literals_view | std::views::filter([&](const Literal&
lit) {
+ return upper.value() >= lit;
+ });
+ // if remaining values are greater than upper bound, rows cannot match
+ if (filtered_view.empty()) {
+ return kRowCannotMatch;
+ }
+
+ return kRowsMightMatch;
+ }
+
+ Result<bool> NotIn(const std::shared_ptr<Bound>& expr,
+ const BoundSetPredicate::LiteralSet& literal_set)
override {
+ // because the bounds are not necessarily a min or max value, this cannot be answered
+ // using them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in
+ // col.
+ return kRowsMightMatch;
+ }
+
+ Result<bool> StartsWith(const std::shared_ptr<Bound>& expr,
+ const Literal& lit) override {
+ if (auto transform = dynamic_cast<const BoundTransform*>(expr.get());
+ transform != nullptr &&
+ transform->transform()->transform_type() != TransformType::kIdentity) {
+ // truncate must be rewritten in binding. the result is either always or never
+ // compatible
+ return kRowsMightMatch;
+ }
+
+ int32_t id = expr->reference()->field().field_id();
+ if (ContainsNullsOnly(id)) {
+ return kRowCannotMatch;
+ }
+ if (lit.type()->type_id() != TypeId::kString) {
+ return kRowCannotMatch;
+ }
+ const auto& prefix = std::get<std::string>(lit.value());
+
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr));
+ if (!lower.has_value() || lower->IsNull()) {
+ return kRowsMightMatch;
+ }
+ const auto& lower_str = std::get<std::string>(lower->value());
+ // truncate lower bound so that its length in bytes is not greater than
the length of
+ // prefix
+ size_t length = std::min(prefix.size(), lower_str.size());
+ // if prefix of lower bound is greater than prefix, rows cannot match
+ if (lower_str.substr(0, length) > prefix) {
+ return kRowCannotMatch;
+ }
+
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr));
+ if (!upper.has_value() || upper->IsNull()) {
+ return kRowsMightMatch;
+ }
+ const auto& upper_str = std::get<std::string>(upper->value());
+ // truncate upper bound so that its length in bytes is not greater than
the length of
+ // prefix
+ length = std::min(prefix.size(), upper_str.size());
+ // if prefix of upper bound is less than prefix, rows cannot match
+ if (upper_str.substr(0, length) < prefix) {
+ return kRowCannotMatch;
+ }
+
+ return kRowsMightMatch;
+ }
+
+ Result<bool> NotStartsWith(const std::shared_ptr<Bound>& expr,
+ const Literal& lit) override {  // prunes only when both bounds start with the prefix and no nulls may be present
+ // the only transforms that produce strings are truncate and identity,
which work with
+ // this
+ int32_t id = expr->reference()->field().field_id();
+ if (MayContainNull(id)) {
+ return kRowsMightMatch;  // nulls present, or no null count recorded for this field
+ }
+
+ if (lit.type()->type_id() != TypeId::kString) {
+ return kRowCannotMatch;  // NOTE(review): same result as StartsWith for a non-string literal; confirm this is intended for the negated predicate
+ }
+ const auto& prefix = std::get<std::string>(lit.value());
+
+ // notStartsWith will match unless all values must start with the prefix.
This happens
+ // when the lower and upper bounds both start with the prefix.
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr));
+ ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr));
+ if (!lower.has_value() || lower->IsNull() || !upper.has_value() ||
upper->IsNull()) {
+ return kRowsMightMatch;  // either bound is missing or null
+ }
+ const auto& lower_str = std::get<std::string>(lower->value());
+ const auto& upper_str = std::get<std::string>(upper->value());
+
+ // if lower is shorter than the prefix then lower doesn't start with the
prefix
+ if (lower_str.size() < prefix.size()) {
+ return kRowsMightMatch;
+ }
+
+ if (lower_str.starts_with(prefix)) {
+ // if upper is shorter than the prefix then upper can't start with the
prefix
+ if (upper_str.size() < prefix.size()) {
+ return kRowsMightMatch;
+ }
+ if (upper_str.starts_with(prefix)) {
+ // both bounds match the prefix, so all rows must match the prefix and
therefore
+ // do not satisfy the predicate
+ return kRowCannotMatch;
+ }
+ }
+
+ return kRowsMightMatch;  // at least one bound does not start with the prefix
+ }
+
+ private:
+ bool MayContainNull(int32_t id) {  // false only when a null count of exactly 0 is recorded for id
+ return data_file_.null_value_counts.empty() ||
+ !data_file_.null_value_counts.contains(id) ||
+ data_file_.null_value_counts.at(id) != 0;  // at() is safe: contains(id) was checked on the previous line
+ }
+
+ bool ContainsNullsOnly(int32_t id) {  // true only when both counts are recorded for id and they are equal
+ auto val_it = data_file_.value_counts.find(id);
+ auto null_it = data_file_.null_value_counts.find(id);
+ return val_it != data_file_.value_counts.cend() &&
+ null_it != data_file_.null_value_counts.cend() &&
+ val_it->second == null_it->second;  // value count == null count
+ }
+
+ bool ContainsNaNsOnly(int32_t id) {  // true only when both counts are recorded for id and they are equal
+ auto val_it = data_file_.value_counts.find(id);
+ auto nan_it = data_file_.nan_value_counts.find(id);
+ return val_it != data_file_.value_counts.cend() &&
+ nan_it != data_file_.nan_value_counts.cend() &&
+ val_it->second == nan_it->second;  // value count == NaN count
+ }
+
+ Result<std::optional<Literal>> LowerBound(const std::shared_ptr<Bound>&
expr) {  // selects the lower-bound source from the dynamic type of expr
+ if (auto reference = dynamic_cast<const BoundReference*>(expr.get());
+ reference != nullptr) {
+ return ParseLowerBound(*reference);  // column reference: use the stored bound
+ } else if (auto transform = dynamic_cast<BoundTransform*>(expr.get());
+ transform != nullptr) {
+ return TransformLowerBound(*transform);  // transform term: use the transformed bound
+ } else {
+ return std::nullopt;  // any other term kind: no bound available
+ }
+ // TODO(xiao.dong) handle extract lower and upper bounds
+ }
+
+ Result<std::optional<Literal>> UpperBound(const std::shared_ptr<Bound>&
expr) {  // selects the upper-bound source from the dynamic type of expr
+ if (auto reference = dynamic_cast<const BoundReference*>(expr.get());
+ reference != nullptr) {
+ return ParseUpperBound(*reference);  // column reference: use the stored bound
+ } else if (auto transform = dynamic_cast<BoundTransform*>(expr.get());
+ transform != nullptr) {
+ return TransformUpperBound(*transform);  // transform term: use the transformed bound
+ } else {
+ return std::nullopt;  // any other term kind: no bound available
+ }
+ // TODO(xiao.dong) handle extract lower and upper bounds
+ }
+
+ Result<std::optional<Literal>> ParseLowerBound(const BoundReference& ref) {  // stored lower bound of ref's field, if one is recorded
+ int32_t id = ref.field().field_id();
+ auto type = ref.type();
+ if (!type->is_primitive()) {
+ return NotSupported("Lower bound of non-primitive type is not
supported.");
+ }
+ auto primitive_type = internal::checked_pointer_cast<PrimitiveType>(type);  // is_primitive() was checked above
+ if (data_file_.lower_bounds.contains(id)) {
+ return Literal::Deserialize(data_file_.lower_bounds.at(id),
primitive_type);  // decode the serialized bound using the field's primitive type
+ }
+
+ return std::nullopt;  // no lower bound recorded for this field
+ }
+
+ Result<std::optional<Literal>> ParseUpperBound(const BoundReference& ref) {  // stored upper bound of ref's field, if one is recorded
+ int32_t id = ref.field().field_id();
+ auto type = ref.type();
+ if (!type->is_primitive()) {
+ return NotSupported("Upper bound of non-primitive type is not
supported.");
+ }
+ auto primitive_type = internal::checked_pointer_cast<PrimitiveType>(type);  // is_primitive() was checked above
+ if (data_file_.upper_bounds.contains(id)) {
+ return Literal::Deserialize(data_file_.upper_bounds.at(id),
primitive_type);  // decode the serialized bound using the field's primitive type
+ }
+
+ return std::nullopt;  // no upper bound recorded for this field
+ }
+
+ Result<std::optional<Literal>> TransformLowerBound(BoundTransform&
boundTransform) {  // lower bound of a transform term, derived from the source column's stored bound
+ auto transform = boundTransform.transform();
+ if (transform->PreservesOrder()) {  // bounds are derived only for order-preserving transforms
+ ICEBERG_ASSIGN_OR_RAISE(auto lower,
ParseLowerBound(*boundTransform.reference()));
+ if (lower.has_value()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto transform_func,
+
transform->Bind(boundTransform.reference()->type()));
+ return transform_func->Transform(lower.value());  // apply the bound transform function to the source bound
+ }
+ }
+
+ return std::nullopt;  // transform does not preserve order, or no bound is recorded
+ }
+
+ Result<std::optional<Literal>> TransformUpperBound(BoundTransform&
boundTransform) {  // upper bound of a transform term, derived from the source column's stored bound
+ auto transform = boundTransform.transform();
+ if (transform->PreservesOrder()) {  // bounds are derived only for order-preserving transforms
+ ICEBERG_ASSIGN_OR_RAISE(auto upper,
ParseUpperBound(*boundTransform.reference()));
+ if (upper.has_value()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto transform_func,
+
transform->Bind(boundTransform.reference()->type()));
+ return transform_func->Transform(upper.value());  // apply the bound transform function to the source bound
+ }
+ }
+
+ return std::nullopt;  // transform does not preserve order, or no bound is recorded
+ }
+
+ /** Returns true if the expression term produces a non-null value for
non-null input. */
+ bool IsNonNullPreserving(const std::shared_ptr<Bound>& expr) {
+ if (auto reference = dynamic_cast<const BoundReference*>(expr.get());
+ reference != nullptr) {
+ return true;  // a plain column reference passes its value through
+ } else if (auto transform = dynamic_cast<BoundTransform*>(expr.get());
+ transform != nullptr) {
+ return transform->transform()->PreservesOrder();  // NOTE(review): order preservation is used as the proxy for non-null preservation
+ }
+ // a non-null variant does not necessarily contain a specific field
+ // and unknown bound terms are not non-null preserving
+ return false;
+ }
+
+ private:
+ const DataFile& data_file_;
+};
+
+InclusiveMetricsEvaluator::InclusiveMetricsEvaluator(std::shared_ptr<Expression>
expr)
+ : expr_(std::move(expr)) {}  // stores the expression as given; Make() passes the result of Binder::Bind
+
+InclusiveMetricsEvaluator::~InclusiveMetricsEvaluator() = default;  // defaulted out of line in the .cc file
+
+Result<std::unique_ptr<InclusiveMetricsEvaluator>>
InclusiveMetricsEvaluator::Make(
+ std::shared_ptr<Expression> expr, const Schema& schema, bool
case_sensitive) {
+ ICEBERG_ASSIGN_OR_RAISE(auto rewrite_expr,
RewriteNot::Visit(std::move(expr)));  // rewrite first; the rewritten expression is what gets bound
+ ICEBERG_ASSIGN_OR_RAISE(auto bound_expr,
+ Binder::Bind(schema, rewrite_expr, case_sensitive));  // a failed bind is returned to the caller instead of an evaluator
+ return std::unique_ptr<InclusiveMetricsEvaluator>(
+ new InclusiveMetricsEvaluator(std::move(bound_expr)));  // plain new: presumably because the constructor is private
+}
+
+Result<bool> InclusiveMetricsEvaluator::Evaluate(const DataFile& data_file)
const {
+ if (data_file.record_count == 0) {
+ return kRowCannotMatch;  // a file with zero records has nothing to match
+ }
+ if (data_file.record_count < 0) {
+ // we haven't implemented parsing record count from avro file and thus set
record
+ // count -1 when importing avro tables to iceberg tables. This should be
updated once
+ // we implemented and set correct record count.
+ return kRowsMightMatch;  // unknown record count: never skip the file
+ }
+ InclusiveMetricsVisitor visitor(data_file);  // a new visitor is built for every call
+ return Visit<bool, InclusiveMetricsVisitor>(expr_, visitor);  // the visitor's result for expr_ is the answer
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.h
b/src/iceberg/expression/inclusive_metrics_evaluator.h
new file mode 100644
index 00000000..1887b339
--- /dev/null
+++ b/src/iceberg/expression/inclusive_metrics_evaluator.h
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/expression/inclusive_metrics_evaluator.h
+///
+/// Evaluates an Expression on a DataFile to test whether rows in the file may
match.
+///
+/// This evaluation is inclusive: it returns true if a file may match and
false if it
+/// cannot match.
+///
+/// Files are passed to InclusiveMetricsEvaluator::Evaluate(const DataFile&), which
+/// returns true if the file may contain matching rows and false if the file cannot
+/// contain matching rows. Files may be skipped if and only if Evaluate returns false.
+///
+/// Due to the comparison implementation of ORC stats, for float/double columns in ORC
+/// files, if the first value in a file is NaN, the metrics of this file will report NaN
+/// for both upper and lower bound even though the column could contain non-NaN data.
+/// Thus in some scenarios explicit checks for NaN are necessary in order to not skip
+/// files that may contain matching data.
+///
+
+#include <memory>
+
+#include "iceberg/expression/expression.h"
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+
+namespace iceberg {
+
+class ICEBERG_EXPORT InclusiveMetricsEvaluator {
+ public:
+ /// \brief Make an inclusive metrics evaluator
+ ///
+ /// \param expr The expression to evaluate
+ /// \param schema The schema of the table
+ /// \param case_sensitive Whether field name matching is case-sensitive
+ static Result<std::unique_ptr<InclusiveMetricsEvaluator>> Make(
+ std::shared_ptr<Expression> expr, const Schema& schema, bool
case_sensitive = true);
+
+ ~InclusiveMetricsEvaluator();
+
+ /// \brief Evaluate the expression against a DataFile.
+ ///
+ /// \param data_file The data file to evaluate
+ /// \return true if rows in the file may match, false if they cannot, or
error
+ Result<bool> Evaluate(const DataFile& data_file) const;
+
+ private:
+ explicit InclusiveMetricsEvaluator(std::shared_ptr<Expression> expr);  // private: instances are created through Make()
+
+ std::shared_ptr<Expression> expr_;  // expression evaluated by Evaluate()
+};
+
+} // namespace iceberg
diff --git a/src/iceberg/expression/literal.cc
b/src/iceberg/expression/literal.cc
index c1aad90d..cb0a4c6d 100644
--- a/src/iceberg/expression/literal.cc
+++ b/src/iceberg/expression/literal.cc
@@ -343,12 +343,32 @@ std::strong_ordering CompareFloat(T lhs, T rhs) {
return lhs_is_negative <=> rhs_is_negative;
}
+namespace {
+
+bool Comparable(TypeId lhs, TypeId rhs) {  // true for identical ids, or two ids from the same group listed below
+ switch (lhs) {
+ case TypeId::kInt:
+ case TypeId::kDate:
+ return rhs == TypeId::kInt || rhs == TypeId::kDate;  // group: int, date
+ case TypeId::kLong:
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz:
+ return rhs == TypeId::kLong || rhs == TypeId::kTimestamp ||
+ rhs == TypeId::kTimestampTz;  // group: long, timestamp, timestamptz
+ default:
+ return lhs == rhs;  // every other type id is comparable only with itself
+ }
+}
+
+} // namespace
+
bool Literal::operator==(const Literal& other) const { return (*this <=>
other) == 0; }
// Three-way comparison operator
std::partial_ordering Literal::operator<=>(const Literal& other) const {
// If types are different, comparison is unordered
- if (type_->type_id() != other.type_->type_id()) {
+ // (Int & Date) and (Long & Timestamp & TimestampTz) pass this check so they can be compared
+ if (!Comparable(type_->type_id(), other.type_->type_id())) {
return std::partial_ordering::unordered;
}
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index d52739be..05397179 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -47,6 +47,7 @@ iceberg_sources = files(
'expression/evaluator.cc',
'expression/expression.cc',
'expression/expressions.cc',
+ 'expression/inclusive_metrics_evaluator.cc',
'expression/literal.cc',
'expression/predicate.cc',
'expression/rewrite_not.cc',
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index 1a7e61ac..0b4793ac 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -94,6 +94,8 @@ add_iceberg_test(expression_test
expression_test.cc
expression_visitor_test.cc
literal_test.cc
+ inclusive_metrics_evaluator_test.cc
+ inclusive_metrics_evaluator_with_transform_test.cc
predicate_test.cc)
add_iceberg_test(json_serde_test
diff --git a/src/iceberg/test/inclusive_metrics_evaluator_test.cc
b/src/iceberg/test/inclusive_metrics_evaluator_test.cc
new file mode 100644
index 00000000..27867f1a
--- /dev/null
+++ b/src/iceberg/test/inclusive_metrics_evaluator_test.cc
@@ -0,0 +1,948 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/expression/inclusive_metrics_evaluator.h"
+
+#include <gtest/gtest.h>
+
+#include "iceberg/expression/binder.h"
+#include "iceberg/expression/expressions.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/schema.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/type.h"
+#include "iceberg/util/truncate_util.h"
+
+namespace iceberg {
+
+namespace {
+constexpr bool kRowsMightMatch = true;  // expected result when the file may contain matching rows
+constexpr bool kRowCannotMatch = false;  // expected result when the file cannot contain matching rows
+constexpr int64_t kIntMinValue = 30;  // lower bound stored for field 1 in PrepareDataFile1
+constexpr int64_t kIntMaxValue = 79;  // upper bound stored for field 1 in PrepareDataFile1
+constexpr float kFloatNan = std::numeric_limits<float>::quiet_NaN();
+constexpr double kDoubleNan = std::numeric_limits<double>::quiet_NaN();
+} // namespace
+using TestVariant = std::variant<bool, int32_t, int64_t, double, std::string>;  // value type of the named bounds passed to PrepareDataFile
+
+class InclusiveMetricsEvaluatorTest : public ::testing::Test {  // fixture: a six-field schema plus DataFile builders
+ protected:
+ Result<std::shared_ptr<Expression>> Bind(const std::shared_ptr<Expression>&
expr,
+ bool case_sensitive = true) {
+ return Binder::Bind(*schema_, expr, case_sensitive);  // binds against the fixture schema
+ }
+
+ void SetUp() override {
+ schema_ = std::make_shared<Schema>(
+ std::vector<SchemaField>{
+ SchemaField::MakeRequired(1, "id", int64()),
+ SchemaField::MakeOptional(2, "name", string()),
+ SchemaField::MakeRequired(3, "age", int32()),
+ SchemaField::MakeOptional(4, "salary", float64()),
+ SchemaField::MakeRequired(5, "active", boolean()),
+ SchemaField::MakeRequired(6, "date", string()),
+ },
+ 0);
+ }
+
+ std::shared_ptr<DataFile> PrepareDataFile(
+ const std::string& partition, int64_t record_count, int64_t
file_size_in_bytes,
+ const std::map<std::string, TestVariant>& lower_bounds,
+ const std::map<std::string, TestVariant>& upper_bounds,
+ const std::map<int32_t, int64_t>& value_counts = {},
+ const std::map<int32_t, int64_t>& null_counts = {},
+ const std::map<int32_t, int64_t>& nan_counts = {}) {  // bounds are keyed by column name, counts by field id
+ auto parse_bound = [&](const std::map<std::string, TestVariant>& bounds,
+ std::map<int32_t, std::vector<uint8_t>>&
bound_values) {
+ for (const auto& [key, value] : bounds) {  // a key matching none of the branches below is ignored
+ if (key == "id") {
+ bound_values[1] =
Literal::Long(std::get<int64_t>(value)).Serialize().value();
+ } else if (key == "name") {
+ bound_values[2] =
+
Literal::String(std::get<std::string>(value)).Serialize().value();
+ } else if (key == "age") {
+ bound_values[3] =
Literal::Int(std::get<int32_t>(value)).Serialize().value();
+ } else if (key == "salary") {
+ bound_values[4] =
Literal::Double(std::get<double>(value)).Serialize().value();
+ } else if (key == "active") {
+ bound_values[5] =
Literal::Boolean(std::get<bool>(value)).Serialize().value();
+ }
+ }
+ };
+
+ auto data_file = std::make_shared<DataFile>();
+ data_file->file_path = "test_path";
+ data_file->file_format = FileFormatType::kParquet;
+ data_file->partition.AddValue(Literal::String(partition));
+ data_file->record_count = record_count;
+ data_file->file_size_in_bytes = file_size_in_bytes;
+ data_file->column_sizes = {};
+ data_file->value_counts = value_counts;
+ data_file->null_value_counts = null_counts;
+ data_file->nan_value_counts = nan_counts;
+ data_file->split_offsets = {1};
+ data_file->sort_order_id = 0;
+ parse_bound(upper_bounds, data_file->upper_bounds);  // serialized bounds are stored under field ids
+ parse_bound(lower_bounds, data_file->lower_bounds);
+ return data_file;
+ }
+
+ void TestCase(const std::shared_ptr<Expression>& unbound, bool
expected_result) {  // evaluates against a 10-record file whose id bounds are [100, 200]
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"id",
static_cast<int64_t>(100)}},
+ {{"id", static_cast<int64_t>(200)}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), expected_result) << unbound->ToString();
+ }
+
+ void TestStringCase(const std::shared_ptr<Expression>& unbound, bool
expected_result) {  // evaluates against a 10-record file: name bounds 123..456, 10 values, 0 nulls
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "123"}},
+ {{"name", "456"}}, {{2, 10}}, {{2, 0}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), expected_result) << unbound->ToString();
+ }
+
+ protected:
+ std::shared_ptr<Schema> schema_;  // assigned in SetUp()
+};
+
+TEST_F(InclusiveMetricsEvaluatorTest, CaseSensitiveTest) {  // checks how Make treats the field-name case of the predicate
+ {
+ auto unbound = Expressions::Equal("id", Literal::Long(300));
+ auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true);
+ ASSERT_TRUE(evaluator.has_value());  // exact-case name succeeds
+ }
+ {
+ auto unbound = Expressions::Equal("ID", Literal::Long(300));
+ auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true);
+ ASSERT_FALSE(evaluator.has_value());  // wrong case fails when case_sensitive is true
+ ASSERT_EQ(evaluator.error().kind, ErrorKind::kInvalidExpression);
+ }
+ {
+ auto unbound = Expressions::Equal("ID", Literal::Long(300));
+ auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, false);
+ ASSERT_TRUE(evaluator.has_value());  // wrong case succeeds when case_sensitive is false
+ }
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, IsNullTest) {  // field 2 with 10 values: 5 nulls might match, 0 nulls cannot
+ {
+ auto unbound = Expressions::IsNull("name");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}},
{{"name", "2"}},
+ {{2, 10}}, {{2, 5}}, {});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString();
+ }
+ {
+ auto unbound = Expressions::IsNull("name");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}},
{{"name", "2"}},
+ {{2, 10}}, {{2, 0}}, {});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString();
+ }
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, NotNullTest) {  // field 2 with 10 values: 5 nulls might match, 10 nulls cannot
+ {
+ auto unbound = Expressions::NotNull("name");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}},
{{"name", "2"}},
+ {{2, 10}}, {{2, 5}}, {});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString();
+ }
+ {
+ auto unbound = Expressions::NotNull("name");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}},
{{"name", "2"}},
+ {{2, 10}}, {{2, 10}}, {});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString();
+ }
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) {  // field 4 with 10 values; the null and NaN counts vary per case
+ {
+ auto unbound = Expressions::IsNaN("salary");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+ {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4,
5}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString();  // 5 nulls, 5 NaNs
+ }
+ {
+ auto unbound = Expressions::IsNaN("salary");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+ {{"salary", 2.0}}, {{4, 10}}, {{4, 10}}, {{4,
5}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString();  // null count equals value count
+ }
+ {
+ auto unbound = Expressions::IsNaN("salary");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+ {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4,
0}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString();  // NaN count is 0
+ }
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, NotNanTest) {  // field 4 with 10 values and no null counts: 5 NaNs might match, 10 NaNs cannot
+ {
+ auto unbound = Expressions::NotNaN("salary");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+ {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 5}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString();
+ }
+ {
+ auto unbound = Expressions::NotNaN("salary");
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+ {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 10}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString();
+ }
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, LTTest) {  // id bounds are [100, 200] (see TestCase)
+ TestCase(Expressions::LessThan("id", Literal::Long(300)), kRowsMightMatch);
+ TestCase(Expressions::LessThan("id", Literal::Long(150)), kRowsMightMatch);
+ TestCase(Expressions::LessThan("id", Literal::Long(100)), kRowCannotMatch);  // literal equals the lower bound
+ TestCase(Expressions::LessThan("id", Literal::Long(200)), kRowsMightMatch);
+ TestCase(Expressions::LessThan("id", Literal::Long(99)), kRowCannotMatch);  // literal is below the lower bound
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, LTEQTest) {  // id bounds are [100, 200] (see TestCase)
+ TestCase(Expressions::LessThanOrEqual("id", Literal::Long(300)),
kRowsMightMatch);
+ TestCase(Expressions::LessThanOrEqual("id", Literal::Long(150)),
kRowsMightMatch);
+ TestCase(Expressions::LessThanOrEqual("id", Literal::Long(100)),
kRowsMightMatch);
+ TestCase(Expressions::LessThanOrEqual("id", Literal::Long(200)),
kRowsMightMatch);
+ TestCase(Expressions::LessThanOrEqual("id", Literal::Long(99)),
kRowCannotMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, GTTest) {  // id bounds are [100, 200] (see TestCase)
+ TestCase(Expressions::GreaterThan("id", Literal::Long(300)),
kRowCannotMatch);
+ TestCase(Expressions::GreaterThan("id", Literal::Long(150)),
kRowsMightMatch);
+ TestCase(Expressions::GreaterThan("id", Literal::Long(100)),
kRowsMightMatch);
+ TestCase(Expressions::GreaterThan("id", Literal::Long(200)),
kRowCannotMatch);
+ TestCase(Expressions::GreaterThan("id", Literal::Long(99)), kRowsMightMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, GTEQTest) {  // id bounds are [100, 200] (see TestCase)
+ TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(300)),
kRowCannotMatch);
+ TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(150)),
kRowsMightMatch);
+ TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(100)),
kRowsMightMatch);
+ TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(200)),
kRowsMightMatch);
+ TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(99)),
kRowsMightMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, EQTest) {  // id bounds are [100, 200] (see TestCase)
+ TestCase(Expressions::Equal("id", Literal::Long(300)), kRowCannotMatch);  // literal is above the upper bound
+ TestCase(Expressions::Equal("id", Literal::Long(150)), kRowsMightMatch);
+ TestCase(Expressions::Equal("id", Literal::Long(100)), kRowsMightMatch);  // literal equals the lower bound
+ TestCase(Expressions::Equal("id", Literal::Long(200)), kRowsMightMatch);  // literal equals the upper bound
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, NotEqTest) {  // every literal is expected to yield kRowsMightMatch
+ TestCase(Expressions::NotEqual("id", Literal::Long(300)), kRowsMightMatch);
+ TestCase(Expressions::NotEqual("id", Literal::Long(150)), kRowsMightMatch);
+ TestCase(Expressions::NotEqual("id", Literal::Long(100)), kRowsMightMatch);
+ TestCase(Expressions::NotEqual("id", Literal::Long(200)), kRowsMightMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, InTest) {  // id bounds are [100, 200] (see TestCase)
+ TestCase(Expressions::In("id",
+ {
+ Literal::Long(300),
+ Literal::Long(400),
+ Literal::Long(500),
+ }),
+ kRowCannotMatch);  // every literal is above the upper bound
+ TestCase(Expressions::In("id",
+ {
+ Literal::Long(150),
+ Literal::Long(300),
+ }),
+ kRowsMightMatch);  // 150 lies inside the bounds
+ TestCase(Expressions::In("id", {Literal::Long(100)}), kRowsMightMatch);
+ TestCase(Expressions::In("id", {Literal::Long(200)}), kRowsMightMatch);
+ TestCase(Expressions::In("id",
+ {
+ Literal::Long(99),
+ Literal::Long(201),
+ }),
+ kRowCannotMatch);  // one literal below the lower bound, one above the upper bound
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, NotInTest) {  // every literal set is expected to yield kRowsMightMatch
+ TestCase(Expressions::NotIn("id",
+ {
+ Literal::Long(300),
+ Literal::Long(400),
+ Literal::Long(500),
+ }),
+ kRowsMightMatch);
+ TestCase(Expressions::NotIn("id",
+ {
+ Literal::Long(150),
+ Literal::Long(300),
+ }),
+ kRowsMightMatch);
+ TestCase(Expressions::NotIn("id",
+ {
+ Literal::Long(100),
+ Literal::Long(200),
+ }),
+ kRowsMightMatch);
+ TestCase(Expressions::NotIn("id",
+ {
+ Literal::Long(99),
+ Literal::Long(201),
+ }),
+ kRowsMightMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, StartsWithTest) {  // name bounds are 123..456 (see TestStringCase)
+ TestStringCase(Expressions::StartsWith("name", "1"), kRowsMightMatch);
+ TestStringCase(Expressions::StartsWith("name", "4"), kRowsMightMatch);
+ TestStringCase(Expressions::StartsWith("name", "12"), kRowsMightMatch);
+ TestStringCase(Expressions::StartsWith("name", "45"), kRowsMightMatch);
+ TestStringCase(Expressions::StartsWith("name", "123"), kRowsMightMatch);
+ TestStringCase(Expressions::StartsWith("name", "456"), kRowsMightMatch);
+ TestStringCase(Expressions::StartsWith("name", "1234"), kRowsMightMatch);  // prefix is longer than the lower bound
+ TestStringCase(Expressions::StartsWith("name", "4567"), kRowCannotMatch);  // prefix is longer than the upper bound
+ TestStringCase(Expressions::StartsWith("name", "78"), kRowCannotMatch);
+ TestStringCase(Expressions::StartsWith("name", "7"), kRowCannotMatch);
+ TestStringCase(Expressions::StartsWith("name", "A"), kRowCannotMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorTest, NotStartsWithTest) {  // first with name bounds 123..456, then with both bounds equal to 123
+ TestStringCase(Expressions::NotStartsWith("name", "1"), kRowsMightMatch);
+ TestStringCase(Expressions::NotStartsWith("name", "4"), kRowsMightMatch);
+ TestStringCase(Expressions::NotStartsWith("name", "12"), kRowsMightMatch);
+ TestStringCase(Expressions::NotStartsWith("name", "45"), kRowsMightMatch);
+ TestStringCase(Expressions::NotStartsWith("name", "123"), kRowsMightMatch);
+ TestStringCase(Expressions::NotStartsWith("name", "456"), kRowsMightMatch);
+ TestStringCase(Expressions::NotStartsWith("name", "1234"), kRowsMightMatch);
+ TestStringCase(Expressions::NotStartsWith("name", "4567"), kRowsMightMatch);
+ TestStringCase(Expressions::NotStartsWith("name", "78"), kRowsMightMatch);
+ TestStringCase(Expressions::NotStartsWith("name", "7"), kRowsMightMatch);
+ TestStringCase(Expressions::NotStartsWith("name", "A"), kRowsMightMatch);
+
+ auto RunTest = [&](const std::string& prefix, bool expected_result) {  // same check, but lower and upper bound are both 123
+ auto unbound = Expressions::NotStartsWith("name", prefix);
+ ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+ InclusiveMetricsEvaluator::Make(unbound, *schema_,
true));
+ auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "123"}},
+ {{"name", "123"}}, {{2, 10}}, {{2, 0}});
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), expected_result) << unbound->ToString();
+ };
+ RunTest("12", kRowCannotMatch);  // both bounds start with the prefix
+ RunTest("123", kRowCannotMatch);  // both bounds equal the prefix
+ RunTest("1234", kRowsMightMatch);  // prefix is longer than both bounds
+}
+
+class InclusiveMetricsEvaluatorMigratedTest : public
InclusiveMetricsEvaluatorTest {
+ protected:
+ void SetUp() override {  // replaces the schema with a 14-field one and builds the five data files
+ schema_ = std::make_shared<Schema>(
+ std::vector<SchemaField>{
+ SchemaField::MakeRequired(1, "id", int64()),
+ SchemaField::MakeOptional(2, "no_stats", int64()),
+ SchemaField::MakeRequired(3, "required", string()),
+ SchemaField::MakeOptional(4, "all_nulls", string()),
+ SchemaField::MakeOptional(5, "some_nulls", string()),
+ SchemaField::MakeOptional(6, "no_nulls", string()),
+ SchemaField::MakeOptional(7, "all_nans", float64()),
+ SchemaField::MakeOptional(8, "some_nans", float32()),
+ SchemaField::MakeOptional(9, "no_nans", float32()),
+ SchemaField::MakeOptional(10, "all_nulls_double", float64()),
+ SchemaField::MakeOptional(11, "all_nans_v1_stats", float32()),
+ SchemaField::MakeOptional(12, "nan_and_null_only", float64()),
+ SchemaField::MakeOptional(13, "no_nan_stats", float64()),
+ SchemaField::MakeOptional(14, "some_empty", string()),
+ },
+ /*schema_id=*/0);
+ file1_ = PrepareDataFile1();
+ file2_ = PrepareDataFile2();
+ file3_ = PrepareDataFile3();
+ file4_ = PrepareDataFile4();
+ file5_ = PrepareDataFile5();
+ }
+
+ std::shared_ptr<DataFile> PrepareDataFile1() {  // 50-record file carrying counts for fields 4-14 and bounds for fields 1, 11, 12 and 14
+ auto data_file = std::make_shared<DataFile>();
+ data_file->file_path = "test_path1";
+ data_file->file_format = FileFormatType::kParquet;
+ data_file->record_count = 50;
+ data_file->value_counts = {
+ {4, 50L}, {5, 50L}, {6, 50L}, {7, 50L}, {8, 50L}, {9, 50L},
+ {10, 50L}, {11, 50L}, {12, 50L}, {13, 50L}, {14, 50L},
+ };
+ data_file->null_value_counts = {
+ {4, 50L}, {5, 10L}, {6, 0L}, {10, 50L}, {11, 0L}, {12, 1L}, {14, 0L},
+ };
+ data_file->nan_value_counts = {
+ {7, 50L},
+ {8, 10L},
+ {9, 0L},
+ };
+ data_file->lower_bounds = {
+ {1, Literal::Long(kIntMinValue).Serialize().value()},
+ {11, Literal::Float(kFloatNan).Serialize().value()},
+ {12, Literal::Double(kDoubleNan).Serialize().value()},
+ {14, Literal::String("").Serialize().value()},
+ };
+ data_file->upper_bounds = {
+ {1, Literal::Long(kIntMaxValue).Serialize().value()},
+ {11, Literal::Float(kFloatNan).Serialize().value()},
+ {12, Literal::Double(kDoubleNan).Serialize().value()},
+ {14, Literal::String("房东整租霍营小区二层两居室").Serialize().value()},
+ };
+ return data_file;
+ }
+
+ std::shared_ptr<DataFile> PrepareDataFile2() {  // 50-record file: field 3 has 50 values, 0 nulls, bounds aa..dC
+ auto data_file = std::make_shared<DataFile>();
+ data_file->file_path = "test_path2";
+ data_file->file_format = FileFormatType::kParquet;
+ data_file->record_count = 50;
+ data_file->value_counts = {
+ {3, 50L},
+ };
+ data_file->null_value_counts = {
+ {3, 0L},
+ };
+ data_file->nan_value_counts = {};
+ data_file->lower_bounds = {
+ {3, Literal::String("aa").Serialize().value()},
+ };
+ data_file->upper_bounds = {
+ {3, Literal::String("dC").Serialize().value()},
+ };
+ return data_file;
+ }
+
+ std::shared_ptr<DataFile> PrepareDataFile3() {  // Builds fixture "test_path3": field 3 only, string bounds "1str1".."3str3".
+ auto data_file = std::make_shared<DataFile>();
+ data_file->file_path = "test_path3";
+ data_file->file_format = FileFormatType::kParquet;
+ data_file->record_count = 50;
+ data_file->value_counts = {
+ {3, 50L},
+ };
+ data_file->null_value_counts = {  // field 3 has no nulls
+ {3, 0L},
+ };
+ data_file->nan_value_counts = {};  // no NaN statistics recorded
+ data_file->lower_bounds = {
+ {3, Literal::String("1str1").Serialize().value()},
+ };
+ data_file->upper_bounds = {
+ {3, Literal::String("3str3").Serialize().value()},
+ };
+ return data_file;
+ }
+
+ std::shared_ptr<DataFile> PrepareDataFile4() {  // Builds fixture "test_path4": field 3 only, ASCII lower bound and non-ASCII upper bound.
+ auto data_file = std::make_shared<DataFile>();
+ data_file->file_path = "test_path4";
+ data_file->file_format = FileFormatType::kParquet;
+ data_file->record_count = 50;
+ data_file->value_counts = {
+ {3, 50L},
+ };
+ data_file->null_value_counts = {  // field 3 has no nulls
+ {3, 0L},
+ };
+ data_file->nan_value_counts = {};  // no NaN statistics recorded
+ data_file->lower_bounds = {
+ {3, Literal::String("abc").Serialize().value()},
+ };
+ data_file->upper_bounds = {
+ {3, Literal::String("イロハニホヘト").Serialize().value()},
+ };
+ return data_file;
+ }
+
+ std::shared_ptr<DataFile> PrepareDataFile5() {  // Builds fixture "test_path5": field 3 only, bounds "abc".."abcdefghi" (both start with "abc").
+ auto data_file = std::make_shared<DataFile>();
+ data_file->file_path = "test_path5";
+ data_file->file_format = FileFormatType::kParquet;
+ data_file->record_count = 50;
+ data_file->value_counts = {
+ {3, 50L},
+ };
+ data_file->null_value_counts = {  // field 3 has no nulls
+ {3, 0L},
+ };
+ data_file->nan_value_counts = {};  // no NaN statistics recorded
+ data_file->lower_bounds = {
+ {3, Literal::String("abc").Serialize().value()},
+ };
+ data_file->upper_bounds = {
+ {3, Literal::String("abcdefghi").Serialize().value()},
+ };
+ return data_file;
+ }
+
+ void RunTest(const std::shared_ptr<Expression>& expr, bool expected_result,
+ const std::shared_ptr<DataFile>& file, bool case_sensitive =
true) {  // Binds `expr` against schema_, evaluates it on `file`, and asserts the outcome equals `expected_result`.
+ ICEBERG_UNWRAP_OR_FAIL(
+ auto evaluator, InclusiveMetricsEvaluator::Make(expr, *schema_,
case_sensitive));
+ auto result = evaluator->Evaluate(*file);
+ ASSERT_TRUE(result.has_value());  // evaluation itself must succeed before the value is compared
+ ASSERT_EQ(result.value(), expected_result) << expr->ToString();  // the expression text is attached to failures
+ };
+
+ std::shared_ptr<DataFile> file1_;
+ std::shared_ptr<DataFile> file2_;
+ std::shared_ptr<DataFile> file3_;
+ std::shared_ptr<DataFile> file4_;
+ std::shared_ptr<DataFile> file5_;
+};
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, CaseSensitiveTest) {  // Checks how the case-sensitivity flag passed to Make() affects resolving "id" vs "ID".
+ {  // lower-case "id", case-sensitive: Make() succeeds
+ auto unbound = Expressions::Equal("id", Literal::Long(300));
+ auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true);
+ ASSERT_TRUE(evaluator.has_value());
+ }
+ {  // upper-case "ID", case-sensitive: Make() fails with kInvalidExpression
+ auto unbound = Expressions::Equal("ID", Literal::Long(300));
+ auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true);
+ ASSERT_FALSE(evaluator.has_value());
+ ASSERT_EQ(evaluator.error().kind, ErrorKind::kInvalidExpression);
+ }
+ {  // upper-case "ID", case-insensitive: Make() succeeds
+ auto unbound = Expressions::Equal("ID", Literal::Long(300));
+ auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, false);
+ ASSERT_TRUE(evaluator.has_value());
+ }
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, AllNullsTest) {  // On file1_, every predicate on "all_nulls" except NotStartsWith must be kRowCannotMatch.
+ RunTest(Expressions::NotNull("all_nulls"), kRowCannotMatch, file1_);
+ RunTest(Expressions::LessThan("all_nulls", Literal::String("a")),
kRowCannotMatch,
+ file1_);
+ RunTest(Expressions::LessThanOrEqual("all_nulls", Literal::String("a")),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::GreaterThan("all_nulls", Literal::String("a")),
kRowCannotMatch,
+ file1_);
+ RunTest(Expressions::GreaterThanOrEqual("all_nulls", Literal::String("a")),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::Equal("all_nulls", Literal::String("a")),
kRowCannotMatch, file1_);
+ RunTest(Expressions::StartsWith("all_nulls", "a"), kRowCannotMatch, file1_);
+ RunTest(Expressions::NotStartsWith("all_nulls", "a"), kRowsMightMatch,
file1_);
+ RunTest(Expressions::NotNull("some_nulls"), kRowsMightMatch, file1_);  // columns with some or no nulls are kept for NotNull
+ RunTest(Expressions::NotNull("no_nulls"), kRowsMightMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, NoNullsTest) {  // IsNull on file1_: kept for "all_nulls"/"some_nulls", pruned for "no_nulls".
+ RunTest(Expressions::IsNull("all_nulls"), kRowsMightMatch, file1_);
+ RunTest(Expressions::IsNull("some_nulls"), kRowsMightMatch, file1_);
+ RunTest(Expressions::IsNull("no_nulls"), kRowCannotMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, IsNaNTest) {  // IsNaN on file1_: pruned only for "no_nans" and "all_nulls_double".
+ RunTest(Expressions::IsNaN("all_nans"), kRowsMightMatch, file1_);
+ RunTest(Expressions::IsNaN("some_nans"), kRowsMightMatch, file1_);
+ RunTest(Expressions::IsNaN("no_nans"), kRowCannotMatch, file1_);
+ RunTest(Expressions::IsNaN("all_nulls_double"), kRowCannotMatch, file1_);
+ RunTest(Expressions::IsNaN("no_nan_stats"), kRowsMightMatch, file1_);  // field 13 has no NaN count in file1_
+ RunTest(Expressions::IsNaN("all_nans_v1_stats"), kRowsMightMatch, file1_);  // field 11 carries NaN lower/upper bounds
+ RunTest(Expressions::IsNaN("nan_and_null_only"), kRowsMightMatch, file1_);  // field 12 carries NaN bounds and one null
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, NotNaNTest) {  // NotNaN on file1_: pruned only for "all_nans"; every other column is kept.
+ RunTest(Expressions::NotNaN("all_nans"), kRowCannotMatch, file1_);
+ RunTest(Expressions::NotNaN("some_nans"), kRowsMightMatch, file1_);
+ RunTest(Expressions::NotNaN("no_nans"), kRowsMightMatch, file1_);
+ RunTest(Expressions::NotNaN("all_nulls_double"), kRowsMightMatch, file1_);
+ RunTest(Expressions::NotNaN("no_nan_stats"), kRowsMightMatch, file1_);
+ RunTest(Expressions::NotNaN("all_nans_v1_stats"), kRowsMightMatch, file1_);
+ RunTest(Expressions::NotNaN("nan_and_null_only"), kRowsMightMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, RequiredColumnTest) {  // On "required": NotNull keeps file1_, IsNull prunes it.
+ RunTest(Expressions::NotNull("required"), kRowsMightMatch, file1_);
+ RunTest(Expressions::IsNull("required"), kRowCannotMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, MissingColumnTest) {  // A predicate on a column absent from the schema must make Make() fail.
+ auto expr = Expressions::LessThan("missing", Literal::Long(5));
+ auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true);
+ ASSERT_FALSE(result.has_value());  // error() is not streamed here: the message would only be evaluated when a value is present
+ ASSERT_TRUE(result.error().message.contains("Cannot find field 'missing' in struct"))
+ << result.error().message;
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, MissingStatsTest) {  // A 50-record file with no column statistics must never be pruned.
+ auto data_file = std::make_shared<DataFile>();
+ data_file->file_path = "test_path";
+ data_file->file_format = FileFormatType::kParquet;
+ data_file->record_count = 50;
+
+ RunTest(Expressions::LessThan("no_stats", Literal::Long(5)), kRowsMightMatch,
+ data_file);
+ RunTest(Expressions::LessThanOrEqual("no_stats", Literal::Long(30)),
kRowsMightMatch,
+ data_file);
+ RunTest(Expressions::Equal("no_stats", Literal::Long(70)), kRowsMightMatch,
data_file);
+ RunTest(Expressions::GreaterThan("no_stats", Literal::Long(78)),
kRowsMightMatch,
+ data_file);
+ RunTest(Expressions::GreaterThanOrEqual("no_stats", Literal::Long(90)),
kRowsMightMatch,
+ data_file);
+ RunTest(Expressions::NotEqual("no_stats", Literal::Long(101)),
kRowsMightMatch,
+ data_file);
+ RunTest(Expressions::IsNull("no_stats"), kRowsMightMatch, data_file);
+ RunTest(Expressions::NotNull("no_stats"), kRowsMightMatch, data_file);
+ RunTest(Expressions::IsNaN("some_nans"), kRowsMightMatch, data_file);
+ RunTest(Expressions::NotNaN("some_nans"), kRowsMightMatch, data_file);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, ZeroRecordFileTest) {  // A file with record_count == 0 must be pruned for every predicate.
+ auto data_file = std::make_shared<DataFile>();
+ data_file->file_path = "test_path";
+ data_file->file_format = FileFormatType::kParquet;
+ data_file->record_count = 0;
+
+ RunTest(Expressions::LessThan("no_stats", Literal::Long(5)), kRowCannotMatch,
+ data_file);
+ RunTest(Expressions::LessThanOrEqual("no_stats", Literal::Long(30)),
kRowCannotMatch,
+ data_file);
+ RunTest(Expressions::Equal("no_stats", Literal::Long(70)), kRowCannotMatch,
data_file);
+ RunTest(Expressions::GreaterThan("no_stats", Literal::Long(78)),
kRowCannotMatch,
+ data_file);
+ RunTest(Expressions::GreaterThanOrEqual("no_stats", Literal::Long(90)),
kRowCannotMatch,
+ data_file);
+ RunTest(Expressions::NotEqual("no_stats", Literal::Long(101)),
kRowCannotMatch,
+ data_file);
+ RunTest(Expressions::IsNull("some_nulls"), kRowCannotMatch, data_file);
+ RunTest(Expressions::NotNull("some_nulls"), kRowCannotMatch, data_file);
+ RunTest(Expressions::IsNaN("some_nans"), kRowCannotMatch, data_file);
+ RunTest(Expressions::NotNaN("some_nans"), kRowCannotMatch, data_file);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, NotTest) {  // Not(...) around range predicates on "id" against file1_: Not(lt) keeps the file, Not(gt) prunes it.
+ RunTest(Expressions::Not(Expressions::LessThan("id",
Literal::Long(kIntMinValue - 25))),
+ kRowsMightMatch, file1_);
+ RunTest(
+ Expressions::Not(Expressions::GreaterThan("id",
Literal::Long(kIntMinValue - 25))),
+ kRowCannotMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, AndTest) {  // And(...) of two "id" range predicates against file1_: two pruned combinations, then one kept.
+ RunTest(Expressions::And(
+ Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)),
+ Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMinValue
- 30))),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::And(
+ Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)),
+ Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue
+ 1))),
+ kRowCannotMatch, file1_);
+ RunTest(
+ Expressions::And(Expressions::GreaterThan("id",
Literal::Long(kIntMinValue - 25)),
+ Expressions::LessThanOrEqual("id",
Literal::Long(kIntMaxValue))),
+ kRowsMightMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, OrTest) {  // Or(...) of two "id" range predicates against file1_: one pruned combination, then one kept.
+ RunTest(Expressions::Or(
+ Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)),
+ Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue
+ 1))),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::Or(
+ Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)),
+ Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue
- 19))),
+ kRowsMightMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerLtTest) {  // "id" < literal on file1_: pruned up to and including kIntMinValue, kept above it.
+ RunTest(Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)),
kRowCannotMatch,
+ file1_);
+ RunTest(Expressions::LessThan("id", Literal::Long(kIntMinValue)),
kRowCannotMatch,
+ file1_);
+ RunTest(Expressions::LessThan("id", Literal::Long(kIntMinValue + 1)),
kRowsMightMatch,
+ file1_);
+ RunTest(Expressions::LessThan("id", Literal::Long(kIntMaxValue)),
kRowsMightMatch,
+ file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerLtEqTest) {  // "id" <= literal on file1_: pruned below kIntMinValue, kept from kIntMinValue upward.
+ RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue - 25)),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue - 1)),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue)),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMaxValue)),
+ kRowsMightMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerGtTest) {  // "id" > literal on file1_: pruned from kIntMaxValue upward, kept below it.
+ RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue + 6)),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue)),
kRowCannotMatch,
+ file1_);
+ RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue - 1)),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue - 4)),
+ kRowsMightMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerGtEqTest) {  // "id" >= literal on file1_: pruned above kIntMaxValue, kept at or below it.
+ RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue +
6)),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue +
1)),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue)),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue -
4)),
+ kRowsMightMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerEqTest) {  // "id" == literal on file1_: kept only for literals in [kIntMinValue, kIntMaxValue].
+ RunTest(Expressions::Equal("id", Literal::Long(kIntMinValue - 25)),
kRowCannotMatch,
+ file1_);
+ RunTest(Expressions::Equal("id", Literal::Long(kIntMinValue - 1)),
kRowCannotMatch,
+ file1_);
+ RunTest(Expressions::Equal("id", Literal::Long(kIntMinValue)),
kRowsMightMatch, file1_);
+ RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue - 4)),
kRowsMightMatch,
+ file1_);
+ RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue)),
kRowsMightMatch, file1_);
+ RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue + 1)),
kRowCannotMatch,
+ file1_);
+ RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue + 6)),
kRowCannotMatch,
+ file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotEqTest) {  // "id" != literal on file1_: kept for every literal, inside or outside the bounds.
+ RunTest(Expressions::NotEqual("id", Literal::Long(kIntMinValue - 25)),
kRowsMightMatch,
+ file1_);
+ RunTest(Expressions::NotEqual("id", Literal::Long(kIntMinValue - 1)),
kRowsMightMatch,
+ file1_);
+ RunTest(Expressions::NotEqual("id", Literal::Long(kIntMinValue)),
kRowsMightMatch,
+ file1_);
+ RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue - 4)),
kRowsMightMatch,
+ file1_);
+ RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue)),
kRowsMightMatch,
+ file1_);
+ RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue + 1)),
kRowsMightMatch,
+ file1_);
+ RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue + 6)),
kRowsMightMatch,
+ file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotEqRewrittenTest) {  // Not(Equal("id", x)) on file1_: kept for every literal, same as the NotEqual cases.
+ RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue
- 25))),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue
- 1))),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::Not(Expressions::Equal("id",
Literal::Long(kIntMinValue))),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue
- 4))),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::Not(Expressions::Equal("id",
Literal::Long(kIntMaxValue))),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue
+ 1))),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue
+ 6))),
+ kRowsMightMatch, file1_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest,
CaseInsensitiveIntegerNotEqRewrittenTest) {  // Same Not(Equal(...)) cases, using upper-case "ID" with case_sensitive = false.
+ RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMinValue
- 25))),
+ kRowsMightMatch, file1_, false);
+ RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMinValue
- 1))),
+ kRowsMightMatch, file1_, false);
+ RunTest(Expressions::Not(Expressions::Equal("ID",
Literal::Long(kIntMinValue))),
+ kRowsMightMatch, file1_, false);
+ RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue
- 4))),
+ kRowsMightMatch, file1_, false);
+ RunTest(Expressions::Not(Expressions::Equal("ID",
Literal::Long(kIntMaxValue))),
+ kRowsMightMatch, file1_, false);
+ RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue
+ 1))),
+ kRowsMightMatch, file1_, false);
+ RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue
+ 6))),
+ kRowsMightMatch, file1_, false);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, CaseSensitiveIntegerNotEqRewrittenTest) {  // Upper-case "ID" with case-sensitive binding must make Make() fail.
+ auto expr = Expressions::Not(Expressions::Equal("ID", Literal::Long(5)));
+ auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true);
+ ASSERT_FALSE(result.has_value());  // error() is not streamed here: the message would only be evaluated when a value is present
+ ASSERT_TRUE(result.error().message.contains("Cannot find field 'ID' in struct"))
+ << result.error().message;
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, StringStartsWithTest) {  // StartsWith against the string bounds of file1_..file4_: kept cases first, then pruned ones.
+ RunTest(Expressions::StartsWith("required", "a"), kRowsMightMatch, file1_);
+ RunTest(Expressions::StartsWith("required", "a"), kRowsMightMatch, file2_);
+ RunTest(Expressions::StartsWith("required", "aa"), kRowsMightMatch, file2_);
+ RunTest(Expressions::StartsWith("required", "aaa"), kRowsMightMatch, file2_);
+ RunTest(Expressions::StartsWith("required", "1s"), kRowsMightMatch, file3_);
+ RunTest(Expressions::StartsWith("required", "1str1x"), kRowsMightMatch,
file3_);
+ RunTest(Expressions::StartsWith("required", "ff"), kRowsMightMatch, file4_);
+
+ RunTest(Expressions::StartsWith("required", "aB"), kRowCannotMatch, file2_);
+ RunTest(Expressions::StartsWith("required", "dWX"), kRowCannotMatch, file2_);
+
+ RunTest(Expressions::StartsWith("required", "5"), kRowCannotMatch, file3_);
+ RunTest(Expressions::StartsWith("required", "3str3x"), kRowCannotMatch,
file3_);
+ RunTest(Expressions::StartsWith("some_empty", "房东整租霍"), kRowsMightMatch,
file1_);
+
+ RunTest(Expressions::StartsWith("all_nulls", ""), kRowCannotMatch, file1_);
+ auto above_max = TruncateUtils::TruncateLiteral(Literal::String("イロハニホヘト"),
4)
+ .value()
+ .ToString();
+ RunTest(Expressions::StartsWith("required", above_max), kRowCannotMatch,
file4_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, StringNotStartsWithTest) {  // NotStartsWith keeps every file except file5_ with prefix "abc" (both of its bounds start with "abc").
+ RunTest(Expressions::NotStartsWith("required", "a"), kRowsMightMatch,
file1_);
+ RunTest(Expressions::NotStartsWith("required", "a"), kRowsMightMatch,
file2_);
+ RunTest(Expressions::NotStartsWith("required", "aa"), kRowsMightMatch,
file2_);
+ RunTest(Expressions::NotStartsWith("required", "aaa"), kRowsMightMatch,
file2_);
+ RunTest(Expressions::NotStartsWith("required", "1s"), kRowsMightMatch,
file3_);
+ RunTest(Expressions::NotStartsWith("required", "1str1x"), kRowsMightMatch,
file3_);
+ RunTest(Expressions::NotStartsWith("required", "ff"), kRowsMightMatch,
file4_);
+
+ RunTest(Expressions::NotStartsWith("required", "aB"), kRowsMightMatch,
file2_);
+ RunTest(Expressions::NotStartsWith("required", "dWX"), kRowsMightMatch,
file2_);
+
+ RunTest(Expressions::NotStartsWith("required", "5"), kRowsMightMatch,
file3_);
+ RunTest(Expressions::NotStartsWith("required", "3str3x"), kRowsMightMatch,
file3_);
+
+ auto above_max = TruncateUtils::TruncateLiteral(Literal::String("イロハニホヘト"),
4)
+ .value()
+ .ToString();
+ RunTest(Expressions::NotStartsWith("required", above_max), kRowsMightMatch,
file4_);
+
+ RunTest(Expressions::NotStartsWith("required", "abc"), kRowCannotMatch,
file5_);
+ RunTest(Expressions::NotStartsWith("required", "abcd"), kRowsMightMatch,
file5_);
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerInTest) {  // In(...) on file1_: "id" sets relative to its bounds, string sets on the null-count columns, then a 401-literal set.
+ RunTest(Expressions::In(
+ "id", {Literal::Long(kIntMinValue - 25),
Literal::Long(kIntMinValue - 24)}),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::In(
+ "id", {Literal::Long(kIntMinValue - 2),
Literal::Long(kIntMinValue - 1)}),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::In("id",
+ {Literal::Long(kIntMinValue - 1),
Literal::Long(kIntMinValue)}),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::In(
+ "id", {Literal::Long(kIntMaxValue - 4),
Literal::Long(kIntMaxValue - 3)}),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::In("id",
+ {Literal::Long(kIntMaxValue),
Literal::Long(kIntMaxValue + 1)}),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::In(
+ "id", {Literal::Long(kIntMaxValue + 1),
Literal::Long(kIntMaxValue + 2)}),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::In(
+ "id", {Literal::Long(kIntMaxValue + 6),
Literal::Long(kIntMaxValue + 7)}),
+ kRowCannotMatch, file1_);
+
+ RunTest(Expressions::In("all_nulls", {Literal::String("abc"),
Literal::String("def")}),
+ kRowCannotMatch, file1_);
+ RunTest(Expressions::In("some_nulls", {Literal::String("abc"),
Literal::String("def")}),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::In("no_nulls", {Literal::String("abc"),
Literal::String("def")}),
+ kRowsMightMatch, file1_);
+
+ std::vector<Literal> ids;
+ for (int i = -400; i <= 0; i++) {  // 401 literals, -400..0 inclusive
+ ids.emplace_back(Literal::Long(i));
+ }
+ RunTest(Expressions::In("id", ids), kRowsMightMatch, file1_);  // the large literal set is expected to keep the file
+}
+
+TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotInTest) {  // NotIn(...) on file1_: every case, including the 401-literal set, is expected to keep the file.
+ RunTest(Expressions::NotIn(
+ "id", {Literal::Long(kIntMinValue - 25),
Literal::Long(kIntMinValue - 24)}),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::NotIn(
+ "id", {Literal::Long(kIntMinValue - 2),
Literal::Long(kIntMinValue - 1)}),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::NotIn(
+ "id", {Literal::Long(kIntMinValue - 1),
Literal::Long(kIntMinValue)}),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::NotIn(
+ "id", {Literal::Long(kIntMaxValue - 4),
Literal::Long(kIntMaxValue - 3)}),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::NotIn(
+ "id", {Literal::Long(kIntMaxValue), Literal::Long(kIntMaxValue +
1)}),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::NotIn(
+ "id", {Literal::Long(kIntMaxValue + 1),
Literal::Long(kIntMaxValue + 2)}),
+ kRowsMightMatch, file1_);
+ RunTest(Expressions::NotIn(
+ "id", {Literal::Long(kIntMaxValue + 6),
Literal::Long(kIntMaxValue + 7)}),
+ kRowsMightMatch, file1_);
+
+ RunTest(
+ Expressions::NotIn("all_nulls", {Literal::String("abc"),
Literal::String("def")}),
+ kRowsMightMatch, file1_);
+ RunTest(
+ Expressions::NotIn("some_nulls", {Literal::String("abc"),
Literal::String("def")}),
+ kRowsMightMatch, file1_);
+ RunTest(
+ Expressions::NotIn("no_nulls", {Literal::String("abc"),
Literal::String("def")}),
+ kRowsMightMatch, file1_);
+
+ std::vector<Literal> ids;
+ for (int i = -400; i <= 0; i++) {  // 401 literals, -400..0 inclusive
+ ids.emplace_back(Literal::Long(i));
+ }
+ RunTest(Expressions::NotIn("id", ids), kRowsMightMatch, file1_);
+}
+
+} // namespace iceberg
diff --git
a/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc
b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc
new file mode 100644
index 00000000..935f3c3a
--- /dev/null
+++ b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc
@@ -0,0 +1,485 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "iceberg/expression/expressions.h"
+#include "iceberg/expression/inclusive_metrics_evaluator.h"
+#include "iceberg/expression/term.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/schema.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/type.h"
+
+namespace iceberg {
+
+namespace {  // file-local constants and helpers for the transform tests
+constexpr bool kRowsMightMatch = true;  // expected evaluator result when the file must be kept
+constexpr bool kRowCannotMatch = false;  // expected evaluator result when the file can be skipped
+constexpr int64_t kIntMinValue = 30;
+constexpr int64_t kIntMaxValue = 79;
+constexpr int64_t kMicrosPerDay = 86'400'000'000LL;  // 86,400 seconds * 1,000,000 microseconds
+constexpr int64_t kTsMinValue = 30 * kMicrosPerDay;  // 30 days in microseconds; same day count as kIntMinValue
+constexpr int64_t kTsMaxValue = 79 * kMicrosPerDay;  // 79 days in microseconds; same day count as kIntMaxValue
+
+std::shared_ptr<UnboundTerm<BoundTransform>> ToBoundTransform(  // Casts an UnboundTransform pointer to the UnboundTerm<BoundTransform> type.
+ const std::shared_ptr<UnboundTransform>& transform) {
+ return std::static_pointer_cast<UnboundTerm<BoundTransform>>(transform);
+}
+} // namespace
+
+class InclusiveMetricsEvaluatorWithTransformTest : public ::testing::Test {  // Fixture: a six-field schema plus one 50-record Avro data file with partial statistics.
+ protected:
+ void SetUp() override {  // Rebuilds schema_ and data_file_ before every test.
+ schema_ = std::make_shared<Schema>(
+ std::vector<SchemaField>{
+ SchemaField::MakeRequired(1, "id", int64()),
+ SchemaField::MakeRequired(2, "ts", timestamp_tz()),
+ SchemaField::MakeOptional(3, "all_nulls", int64()),
+ SchemaField::MakeOptional(4, "all_nulls_str", string()),
+ SchemaField::MakeOptional(5, "no_stats", int64()),
+ SchemaField::MakeOptional(6, "str", string()),
+ },
+ /*schema_id=*/0);
+
+ data_file_ = std::make_shared<DataFile>();
+ data_file_->file_path = "file.avro";
+ data_file_->file_format = FileFormatType::kAvro;
+ data_file_->record_count = 50;
+ data_file_->value_counts = {  // fields 5 ("no_stats") and 6 ("str") have no counts
+ {1, 50L},
+ {2, 50L},
+ {3, 50L},
+ {4, 50L},
+ };
+ data_file_->null_value_counts = {  // fields 3 and 4 are entirely null (50 of 50)
+ {1, 0L},
+ {2, 0L},
+ {3, 50L},
+ {4, 50L},
+ };
+ data_file_->nan_value_counts.clear();
+ data_file_->lower_bounds = {  // bounds exist only for "ts" (field 2) and "str" (field 6)
+ {2, Literal::TimestampTz(kTsMinValue).Serialize().value()},
+ {6, Literal::String("abc").Serialize().value()},
+ };
+ data_file_->upper_bounds = {
+ {2, Literal::TimestampTz(kTsMaxValue).Serialize().value()},
+ {6, Literal::String("abe").Serialize().value()},
+ };
+ }
+
+ void ExpectShouldRead(const std::shared_ptr<Expression>& expr, bool
expected_result,
+ std::shared_ptr<DataFile> file = nullptr,
+ bool case_sensitive = true) {  // Evaluates `expr` on `file` (data_file_ when null) and asserts the result equals `expected_result`.
+ auto target_file = file ? file : data_file_;
+ ICEBERG_UNWRAP_OR_FAIL(
+ auto evaluator, InclusiveMetricsEvaluator::Make(expr, *schema_,
case_sensitive));
+ auto eval_result = evaluator->Evaluate(*target_file);
+ ASSERT_TRUE(eval_result.has_value());
+ ASSERT_EQ(eval_result.value(), expected_result) << expr->ToString();
+ }
+
+ std::vector<std::shared_ptr<Expression>> MissingStatsExpressions() const {  // Predicates over truncate("no_stats", 10), a column with no statistics in data_file_.
+ auto truncate_no_stats =
ToBoundTransform(Expressions::Truncate("no_stats", 10));
+ return {
+ Expressions::LessThan(truncate_no_stats, Literal::Long(5)),
+ Expressions::LessThanOrEqual(truncate_no_stats, Literal::Long(30)),
+ Expressions::Equal(truncate_no_stats, Literal::Long(70)),
+ Expressions::GreaterThan(truncate_no_stats, Literal::Long(78)),
+ Expressions::GreaterThanOrEqual(truncate_no_stats, Literal::Long(90)),
+ Expressions::NotEqual(truncate_no_stats, Literal::Long(101)),
+ Expressions::IsNull(truncate_no_stats),
+ Expressions::NotNull(truncate_no_stats),
+ };
+ }
+
+ std::shared_ptr<Schema> schema_;
+ std::shared_ptr<DataFile> data_file_;
+};
+
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest,
AllNullsWithNonOrderPreserving) {  // bucket("all_nulls", 100): only IsNull, NotEqual and NotIn are expected to keep the file.
+ auto bucket_all_nulls = ToBoundTransform(Expressions::Bucket("all_nulls",
100));
+ ExpectShouldRead(Expressions::IsNull(bucket_all_nulls), kRowsMightMatch);
+ ExpectShouldRead(Expressions::NotNull(bucket_all_nulls), kRowCannotMatch);
+ ExpectShouldRead(Expressions::LessThan(bucket_all_nulls, Literal::Int(30)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::LessThanOrEqual(bucket_all_nulls,
Literal::Int(30)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::GreaterThan(bucket_all_nulls,
Literal::Int(30)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::GreaterThanOrEqual(bucket_all_nulls,
Literal::Int(30)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::Equal(bucket_all_nulls, Literal::Int(30)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::NotEqual(bucket_all_nulls, Literal::Int(30)),
+ kRowsMightMatch);
+ ExpectShouldRead(Expressions::In(bucket_all_nulls, {Literal::Int(1),
Literal::Int(2)}),
+ kRowCannotMatch);
+ ExpectShouldRead(
+ Expressions::NotIn(bucket_all_nulls, {Literal::Int(1), Literal::Int(2)}),
+ kRowsMightMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest,
RequiredWithNonOrderPreserving) {  // bucket("ts", 100): every predicate, including IsNull, is expected to keep the file.
+ auto bucket_ts = ToBoundTransform(Expressions::Bucket("ts", 100));
+ ExpectShouldRead(Expressions::IsNull(bucket_ts), kRowsMightMatch);
+ ExpectShouldRead(Expressions::NotNull(bucket_ts), kRowsMightMatch);
+ ExpectShouldRead(Expressions::LessThan(bucket_ts, Literal::Int(30)),
kRowsMightMatch);
+ ExpectShouldRead(Expressions::LessThanOrEqual(bucket_ts, Literal::Int(30)),
+ kRowsMightMatch);
+ ExpectShouldRead(Expressions::GreaterThan(bucket_ts, Literal::Int(30)),
+ kRowsMightMatch);
+ ExpectShouldRead(Expressions::GreaterThanOrEqual(bucket_ts,
Literal::Int(30)),
+ kRowsMightMatch);
+ ExpectShouldRead(Expressions::Equal(bucket_ts, Literal::Int(30)),
kRowsMightMatch);
+ ExpectShouldRead(Expressions::NotEqual(bucket_ts, Literal::Int(30)),
kRowsMightMatch);
+ ExpectShouldRead(Expressions::In(bucket_ts, {Literal::Int(1),
Literal::Int(2)}),
+ kRowsMightMatch);
+ ExpectShouldRead(Expressions::NotIn(bucket_ts, {Literal::Int(1),
Literal::Int(2)}),
+ kRowsMightMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, AllNulls) {  // truncate(..., 10) over the all-null columns: integer column first, then the string column.
+ auto truncate_all_nulls =
ToBoundTransform(Expressions::Truncate("all_nulls", 10));
+ ExpectShouldRead(Expressions::IsNull(truncate_all_nulls), kRowsMightMatch);
+ ExpectShouldRead(Expressions::NotNull(truncate_all_nulls), kRowCannotMatch);
+ ExpectShouldRead(Expressions::LessThan(truncate_all_nulls,
Literal::Long(30)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::LessThanOrEqual(truncate_all_nulls,
Literal::Long(30)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::GreaterThan(truncate_all_nulls,
Literal::Long(30)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::GreaterThanOrEqual(truncate_all_nulls,
Literal::Long(30)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::Equal(truncate_all_nulls, Literal::Long(30)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::NotEqual(truncate_all_nulls,
Literal::Long(30)),
+ kRowsMightMatch);
+ ExpectShouldRead(
+ Expressions::In(truncate_all_nulls, {Literal::Long(10),
Literal::Long(20)}),
+ kRowCannotMatch);
+ ExpectShouldRead(
+ Expressions::NotIn(truncate_all_nulls, {Literal::Long(10),
Literal::Long(20)}),
+ kRowsMightMatch);
+
+ auto truncate_all_nulls_str =
+ ToBoundTransform(Expressions::Truncate("all_nulls_str", 10));
+ ExpectShouldRead(Expressions::StartsWith(truncate_all_nulls_str, "a"),
kRowsMightMatch);
+ ExpectShouldRead(Expressions::NotStartsWith(truncate_all_nulls_str, "a"),
+ kRowsMightMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, MissingColumn) {  // A transform over a column absent from the schema must make Make() fail.
+ auto expr = Expressions::LessThan(
+ ToBoundTransform(Expressions::Truncate("missing", 10)), Literal::Long(20));
+ auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true);
+ ASSERT_FALSE(result.has_value());  // error() is not streamed here: the message would only be evaluated when a value is present
+ ASSERT_TRUE(result.error().message.contains("Cannot find field 'missing'"))
+ << result.error().message;
+}
+
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, MissingStats) {  // Every MissingStatsExpressions() predicate must keep the default data_file_.
+ for (const auto& expr : MissingStatsExpressions()) {
+ ExpectShouldRead(expr, kRowsMightMatch);
+ }
+}
+
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, ZeroRecordFile) {  // A file with record_count == 0 must be pruned for every MissingStatsExpressions() predicate.
+ auto zero_record_file = std::make_shared<DataFile>();
+ zero_record_file->file_path = "file.parquet";
+ zero_record_file->file_format = FileFormatType::kParquet;
+ zero_record_file->record_count = 0;
+
+ for (const auto& expr : MissingStatsExpressions()) {
+ ExpectShouldRead(expr, kRowCannotMatch, zero_record_file);
+ }
+}
+
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, Not) {  // Not(...) around range predicates on day("ts"): Not(lt) keeps the file, Not(gt) prunes it.
+ auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+ ExpectShouldRead(
+ Expressions::Not(Expressions::LessThan(day_ts,
Literal::Long(kIntMinValue - 25))),
+ kRowsMightMatch);
+ ExpectShouldRead(Expressions::Not(Expressions::GreaterThan(
+ day_ts, Literal::Long(kIntMinValue - 25))),
+ kRowCannotMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, And) {  // And(...) of two day("ts") range predicates: two pruned combinations, then one kept.
+ auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+ ExpectShouldRead(
+ Expressions::And(
+ Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)),
+ Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMinValue -
30))),
+ kRowCannotMatch);
+ ExpectShouldRead(
+ Expressions::And(
+ Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)),
+ Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue +
1))),
+ kRowCannotMatch);
+ ExpectShouldRead(
+ Expressions::And(Expressions::GreaterThan(day_ts,
Literal::Long(kIntMinValue - 25)),
+ Expressions::LessThanOrEqual(day_ts,
Literal::Long(kIntMinValue))),
+ kRowsMightMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, Or) {  // Or(...) of two day("ts") range predicates: one pruned combination, then one kept.
+ auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+ ExpectShouldRead(
+ Expressions::Or(
+ Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)),
+ Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue +
1))),
+ kRowCannotMatch);
+ ExpectShouldRead(
+ Expressions::Or(
+ Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)),
+ Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue -
19))),
+ kRowsMightMatch);
+}
+
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerLt) {  // day("ts") < literal: pruned up to and including kIntMinValue, kept above it.
+ auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+ ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue -
25)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue)),
+ kRowCannotMatch);
+ ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue +
1)),
+ kRowsMightMatch);
+ ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMaxValue)),
+ kRowsMightMatch);
+}
+
+// day(ts) <= bound, probing bounds around kIntMinValue and at kIntMaxValue.
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerLtEq) {
+  auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+  // Evaluates `day(ts) <= bound` and checks it against the expected outcome.
+  const auto check = [&](auto bound, auto expected) {
+    ExpectShouldRead(Expressions::LessThanOrEqual(day_ts, Literal::Long(bound)),
+                     expected);
+  };
+
+  check(kIntMinValue - 25, kRowCannotMatch);
+  check(kIntMinValue - 1, kRowCannotMatch);
+  check(kIntMinValue, kRowsMightMatch);
+  check(kIntMaxValue, kRowsMightMatch);
+}
+
+// day(ts) > bound, probing bounds around kIntMaxValue.
+// NOTE(review): unlike the sibling tests, which build Literal::Long values, this
+// test uses Literal::Int and Literal::Date -- presumably to exercise other literal
+// types against the day transform; confirm that the mix is intentional.
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerGt) {
+  auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+  const auto well_above_max = kIntMaxValue + 6;
+  const auto just_below_max = kIntMaxValue - 1;
+  const auto below_max = kIntMaxValue - 4;
+
+  // Bounds at or above kIntMaxValue: expected result is kRowCannotMatch.
+  ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Int(well_above_max)),
+                   kRowCannotMatch);
+  ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Date(kIntMaxValue)),
+                   kRowCannotMatch);
+
+  // Bounds below kIntMaxValue: expected result is kRowsMightMatch.
+  ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Date(just_below_max)),
+                   kRowsMightMatch);
+  ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Date(below_max)),
+                   kRowsMightMatch);
+}
+
+// day(ts) >= bound, probing bounds around kIntMaxValue.
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerGtEq) {
+  auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+  // Evaluates `day(ts) >= bound` and checks it against the expected outcome.
+  const auto check = [&](auto bound, auto expected) {
+    ExpectShouldRead(Expressions::GreaterThanOrEqual(day_ts, Literal::Long(bound)),
+                     expected);
+  };
+
+  check(kIntMaxValue + 6, kRowCannotMatch);
+  check(kIntMaxValue + 1, kRowCannotMatch);
+  check(kIntMaxValue, kRowsMightMatch);
+  check(kIntMaxValue - 4, kRowsMightMatch);
+}
+
+// day(ts) == value, sweeping values from below kIntMinValue to above kIntMaxValue.
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerEq) {
+  auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+  // Evaluates `day(ts) == value` and checks it against the expected outcome.
+  const auto check = [&](auto value, auto expected) {
+    ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(value)), expected);
+  };
+
+  // Values below kIntMinValue.
+  check(kIntMinValue - 25, kRowCannotMatch);
+  check(kIntMinValue - 1, kRowCannotMatch);
+  // Values from kIntMinValue through kIntMaxValue.
+  check(kIntMinValue, kRowsMightMatch);
+  check(kIntMaxValue - 4, kRowsMightMatch);
+  check(kIntMaxValue, kRowsMightMatch);
+  // Values above kIntMaxValue.
+  check(kIntMaxValue + 1, kRowCannotMatch);
+  check(kIntMaxValue + 6, kRowCannotMatch);
+}
+
+// day(ts) != value: every probed value is expected to yield kRowsMightMatch.
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerNotEq) {
+  auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+  // Evaluates `day(ts) != value` and expects kRowsMightMatch.
+  const auto check_might_match = [&](auto value) {
+    ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(value)),
+                     kRowsMightMatch);
+  };
+
+  check_might_match(kIntMinValue - 25);
+  check_might_match(kIntMinValue - 1);
+  check_might_match(kIntMinValue);
+  check_might_match(kIntMaxValue - 4);
+  check_might_match(kIntMaxValue);
+  check_might_match(kIntMaxValue + 1);
+  check_might_match(kIntMaxValue + 6);
+}
+
+// NOT(day(ts) == value): every probed value is expected to yield kRowsMightMatch.
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerNotEqRewritten) {
+  auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+  // Evaluates `NOT(day(ts) == value)` and expects kRowsMightMatch.
+  const auto check_might_match = [&](auto value) {
+    ExpectShouldRead(
+        Expressions::Not(Expressions::Equal(day_ts, Literal::Long(value))),
+        kRowsMightMatch);
+  };
+
+  check_might_match(kIntMinValue - 25);
+  check_might_match(kIntMinValue - 1);
+  check_might_match(kIntMinValue);
+  check_might_match(kIntMaxValue - 4);
+  check_might_match(kIntMaxValue);
+  check_might_match(kIntMaxValue + 1);
+  check_might_match(kIntMaxValue + 6);
+}
+
+// Same sweep as IntegerNotEqRewritten, but the column is referenced as "TS" and
+// ExpectShouldRead is called with `nullptr, false` as its trailing arguments
+// (presumably "default data file" and "case-insensitive" -- the helper is defined
+// outside this chunk, so confirm there).
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest,
+       CaseInsensitiveIntegerNotEqRewritten) {
+  auto day_ts = ToBoundTransform(Expressions::Day("TS"));
+  // Evaluates `NOT(day(TS) == value)` and expects kRowsMightMatch.
+  const auto check_might_match = [&](auto value) {
+    ExpectShouldRead(
+        Expressions::Not(Expressions::Equal(day_ts, Literal::Long(value))),
+        kRowsMightMatch, nullptr, false);
+  };
+
+  check_might_match(kIntMinValue - 25);
+  check_might_match(kIntMinValue - 1);
+  check_might_match(kIntMinValue);
+  check_might_match(kIntMaxValue - 4);
+  check_might_match(kIntMaxValue);
+  check_might_match(kIntMaxValue + 1);
+  check_might_match(kIntMaxValue + 6);
+}
+
+// Building an evaluator for a predicate on the upper-cased column name "TS" is
+// expected to fail with a "Cannot find field 'TS'" error when Make() is called with
+// `true` as its third argument (presumably the case-sensitivity flag -- confirm
+// against the InclusiveMetricsEvaluator::Make declaration).
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest,
+       CaseSensitiveIntegerNotEqRewritten) {
+  auto day_ts = ToBoundTransform(Expressions::Day("TS"));
+  auto expr = Expressions::Not(Expressions::Equal(day_ts, Literal::Long(5)));
+  auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true);
+  // Do not stream result.error() here: the message is only evaluated when this
+  // assertion fails, i.e. when `result` holds a value and has no error to read.
+  ASSERT_FALSE(result.has_value())
+      << "expected Make() to fail for case-sensitive lookup of 'TS'";
+  // Safe from here on: the assertion above guarantees `result` holds an error.
+  ASSERT_TRUE(result.error().message.contains("Cannot find field 'TS'"))
+      << result.error().message;
+}
+
+// STARTS WITH on truncate[10](str): each probed prefix expects kRowsMightMatch.
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, StringStartsWith) {
+  auto truncate_str = ToBoundTransform(Expressions::Truncate("str", 10));
+  for (const char* prefix : {"a", "ab", "b"}) {
+    ExpectShouldRead(Expressions::StartsWith(truncate_str, prefix), kRowsMightMatch);
+  }
+}
+
+// NOT STARTS WITH on truncate[10](str).
+// The predicates are built with Expressions::NotStartsWith so that this test
+// covers the negated operator instead of repeating the StringStartsWith checks.
+// NOTE(review): the kRowsMightMatch expectations are carried over unchanged from
+// the previous version of this test; confirm them against the evaluator.
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, StringNotStartsWith) {
+  auto truncate_str = ToBoundTransform(Expressions::Truncate("str", 10));
+  ExpectShouldRead(Expressions::NotStartsWith(truncate_str, "a"), kRowsMightMatch);
+  ExpectShouldRead(Expressions::NotStartsWith(truncate_str, "ab"), kRowsMightMatch);
+  ExpectShouldRead(Expressions::NotStartsWith(truncate_str, "b"), kRowsMightMatch);
+}
+
+// day(ts) IN (...): two-element sets sliding from below kIntMinValue to above
+// kIntMaxValue, followed by one 401-element set.
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerIn) {
+  auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+  // Evaluates `day(ts) IN (first, second)` and checks the expected outcome.
+  const auto check_pair = [&](auto first, auto second, auto expected) {
+    ExpectShouldRead(
+        Expressions::In(day_ts, {Literal::Long(first), Literal::Long(second)}),
+        expected);
+  };
+
+  // Both values below kIntMinValue.
+  check_pair(kIntMinValue - 25, kIntMinValue - 24, kRowCannotMatch);
+  check_pair(kIntMinValue - 2, kIntMinValue - 1, kRowCannotMatch);
+  // At least one value within [kIntMinValue, kIntMaxValue].
+  check_pair(kIntMinValue - 1, kIntMinValue, kRowsMightMatch);
+  check_pair(kIntMaxValue - 4, kIntMaxValue - 3, kRowsMightMatch);
+  check_pair(kIntMaxValue, kIntMaxValue + 1, kRowsMightMatch);
+  // Both values above kIntMaxValue.
+  check_pair(kIntMaxValue + 1, kIntMaxValue + 2, kRowCannotMatch);
+  check_pair(kIntMaxValue + 6, kIntMaxValue + 7, kRowCannotMatch);
+
+  // A set holding every value in [-400, 0] (401 literals).
+  std::vector<Literal> ids;
+  ids.reserve(401);
+  for (int day = -400; day <= 0; ++day) {
+    ids.push_back(Literal::Long(day));
+  }
+  ExpectShouldRead(Expressions::In(day_ts, ids), kRowsMightMatch);
+}
+
+// day(ts) NOT IN (...): every probed set is expected to yield kRowsMightMatch.
+TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerNotIn) {
+  auto day_ts = ToBoundTransform(Expressions::Day("ts"));
+  // Evaluates `day(ts) NOT IN (first, second)` and expects kRowsMightMatch.
+  const auto check_pair = [&](auto first, auto second) {
+    ExpectShouldRead(
+        Expressions::NotIn(day_ts, {Literal::Long(first), Literal::Long(second)}),
+        kRowsMightMatch);
+  };
+
+  check_pair(kIntMinValue - 25, kIntMinValue - 24);
+  check_pair(kIntMinValue - 2, kIntMinValue - 1);
+  check_pair(kIntMinValue - 1, kIntMinValue);
+  check_pair(kIntMaxValue - 4, kIntMaxValue - 3);
+  check_pair(kIntMaxValue, kIntMaxValue + 1);
+  check_pair(kIntMaxValue + 1, kIntMaxValue + 2);
+  check_pair(kIntMaxValue + 6, kIntMaxValue + 7);
+
+  // A set holding every value in [-400, 0] (401 literals).
+  std::vector<Literal> ids;
+  ids.reserve(401);
+  for (int day = -400; day <= 0; ++day) {
+    ids.push_back(Literal::Long(day));
+  }
+  ExpectShouldRead(Expressions::NotIn(day_ts, ids), kRowsMightMatch);
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index a6a23238..1caeea2a 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -59,6 +59,8 @@ iceberg_tests = {
'aggregate_test.cc',
'expression_test.cc',
'expression_visitor_test.cc',
+ 'inclusive_metrics_evaluator_test.cc',
+ 'inclusive_metrics_evaluator_with_transform_test.cc',
'literal_test.cc',
'predicate_test.cc',
),