This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new 09f26b66 feat: add strict metrics evaluator (#383)
09f26b66 is described below

commit 09f26b664e7c04628f8e6fdd272eedcc99ae107a
Author: dongxiao <[email protected]>
AuthorDate: Fri Dec 5 11:39:45 2025 +0800

    feat: add strict metrics evaluator (#383)
---
 src/iceberg/CMakeLists.txt                         |   1 +
 src/iceberg/catalog/rest/meson.build               |   1 -
 src/iceberg/expression/meson.build                 |   4 +
 src/iceberg/expression/strict_metrics_evaluator.cc | 506 ++++++++++++
 src/iceberg/expression/strict_metrics_evaluator.h  |  79 ++
 src/iceberg/meson.build                            |   1 +
 src/iceberg/test/CMakeLists.txt                    |   3 +-
 src/iceberg/test/meson.build                       |   1 +
 src/iceberg/test/strict_metrics_evaluator_test.cc  | 849 +++++++++++++++++++++
 9 files changed, 1443 insertions(+), 2 deletions(-)

diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 369666b7..275d71fc 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -29,6 +29,7 @@ set(ICEBERG_SOURCES
     expression/literal.cc
     expression/predicate.cc
     expression/rewrite_not.cc
+    expression/strict_metrics_evaluator.cc
     expression/term.cc
     file_reader.cc
     file_writer.cc
diff --git a/src/iceberg/catalog/rest/meson.build 
b/src/iceberg/catalog/rest/meson.build
index 89a68850..8378b2a8 100644
--- a/src/iceberg/catalog/rest/meson.build
+++ b/src/iceberg/catalog/rest/meson.build
@@ -61,7 +61,6 @@ install_headers(
         'error_handlers.h',
         'http_client.h',
         'iceberg_rest_export.h',
-        'json_internal.h',
         'resource_paths.h',
         'rest_catalog.h',
         'rest_util.h',
diff --git a/src/iceberg/expression/meson.build 
b/src/iceberg/expression/meson.build
index 83005908..8e312791 100644
--- a/src/iceberg/expression/meson.build
+++ b/src/iceberg/expression/meson.build
@@ -17,13 +17,17 @@
 
 install_headers(
     [
+        'aggregate.h',
         'binder.h',
+        'evaluator.h',
         'expression.h',
         'expression_visitor.h',
         'expressions.h',
+        'inclusive_metrics_evaluator.h',
         'literal.h',
         'predicate.h',
         'rewrite_not.h',
+        'strict_metrics_evaluator.h',
         'term.h',
     ],
     subdir: 'iceberg/expression',
diff --git a/src/iceberg/expression/strict_metrics_evaluator.cc 
b/src/iceberg/expression/strict_metrics_evaluator.cc
new file mode 100644
index 00000000..e2fe34f1
--- /dev/null
+++ b/src/iceberg/expression/strict_metrics_evaluator.cc
@@ -0,0 +1,506 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/expression/strict_metrics_evaluator.h"
+
+#include "iceberg/expression/binder.h"
+#include "iceberg/expression/expression_visitor.h"
+#include "iceberg/expression/rewrite_not.h"
+#include "iceberg/expression/term.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/schema.h"
+#include "iceberg/type.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+namespace {
+constexpr bool kRowsMustMatch = true;
+constexpr bool kRowsMightNotMatch = false;
+}  // namespace
+
+// If the term in any expression is not a direct reference, assume that rows 
may not
+// match. This happens when transforms or other expressions are passed to this 
evaluator.
+// For example, bucket16(x) = 0 can't be determined because this visitor 
operates on data
+// metrics and not partition values. It may be possible to un-transform 
expressions for
+// order preserving transforms in the future, but this is not currently 
supported.
+#define RETURN_IF_NOT_REFERENCE(expr)                                         \
+  if (auto ref = dynamic_cast<BoundReference*>(expr.get()); ref == nullptr) { \
+    return kRowsMightNotMatch;                                                \
+  }
+
+class StrictMetricsVisitor : public BoundVisitor<bool> {
+ public:
+  explicit StrictMetricsVisitor(const DataFile& data_file, const Schema& 
schema)
+      : data_file_(data_file), schema_(schema) {}
+
+  Result<bool> AlwaysTrue() override { return kRowsMustMatch; }
+
+  Result<bool> AlwaysFalse() override { return kRowsMightNotMatch; }
+
+  Result<bool> Not(bool child_result) override { return !child_result; }
+
+  Result<bool> And(bool left_result, bool right_result) override {
+    return left_result && right_result;
+  }
+
+  Result<bool> Or(bool left_result, bool right_result) override {
+    return left_result || right_result;
+  }
+
+  Result<bool> IsNull(const std::shared_ptr<Bound>& expr) override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    // no need to check whether the field is required because binding 
evaluates that case
+    // if the column has any non-null values, the expression does not match
+    int32_t id = expr->reference()->field().field_id();
+
+    ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+    if (is_nested) {
+      return kRowsMightNotMatch;
+    }
+
+    if (ContainsNullsOnly(id)) {
+      return kRowsMustMatch;
+    }
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> NotNull(const std::shared_ptr<Bound>& expr) override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    // no need to check whether the field is required because binding 
evaluates that case
+    // if the column has any null values, the expression does not match
+    int32_t id = expr->reference()->field().field_id();
+
+    ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+    if (is_nested) {
+      return kRowsMightNotMatch;
+    }
+
+    auto it = data_file_.null_value_counts.find(id);
+    if (it != data_file_.null_value_counts.cend() && it->second == 0) {
+      return kRowsMustMatch;
+    }
+
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> IsNaN(const std::shared_ptr<Bound>& expr) override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    int32_t id = expr->reference()->field().field_id();
+
+    if (ContainsNaNsOnly(id)) {
+      return kRowsMustMatch;
+    }
+
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> NotNaN(const std::shared_ptr<Bound>& expr) override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    int32_t id = expr->reference()->field().field_id();
+
+    auto it = data_file_.nan_value_counts.find(id);
+    if (it != data_file_.nan_value_counts.cend() && it->second == 0) {
+      return kRowsMustMatch;
+    }
+
+    if (ContainsNullsOnly(id)) {
+      return kRowsMustMatch;
+    }
+
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> Lt(const std::shared_ptr<Bound>& expr, const Literal& lit) 
override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    // Rows must match when: <----------Min----Max---X------->
+    int32_t id = expr->reference()->field().field_id();
+
+    ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+    if (is_nested) {
+      return kRowsMightNotMatch;
+    }
+
+    if (CanContainNulls(id) || CanContainNaNs(id)) {
+      return kRowsMightNotMatch;
+    }
+
+    auto it = data_file_.upper_bounds.find(id);
+    if (it != data_file_.upper_bounds.cend()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, it->second));
+      if (upper < lit) {
+        return kRowsMustMatch;
+      }
+    }
+
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> LtEq(const std::shared_ptr<Bound>& expr, const Literal& lit) 
override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    // Rows must match when: <----------Min----Max---X------->
+    int32_t id = expr->reference()->field().field_id();
+
+    ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+    if (is_nested) {
+      return kRowsMightNotMatch;
+    }
+
+    if (CanContainNulls(id) || CanContainNaNs(id)) {
+      return kRowsMightNotMatch;
+    }
+
+    auto it = data_file_.upper_bounds.find(id);
+    if (it != data_file_.upper_bounds.cend()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, it->second));
+      if (upper <= lit) {
+        return kRowsMustMatch;
+      }
+    }
+
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> Gt(const std::shared_ptr<Bound>& expr, const Literal& lit) 
override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    // Rows must match when: <-------X---Min----Max---------->
+    int32_t id = expr->reference()->field().field_id();
+
+    ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+    if (is_nested) {
+      return kRowsMightNotMatch;
+    }
+
+    if (CanContainNulls(id) || CanContainNaNs(id)) {
+      return kRowsMightNotMatch;
+    }
+
+    auto it = data_file_.lower_bounds.find(id);
+    if (it != data_file_.lower_bounds.cend()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, it->second));
+      if (lower.IsNaN()) {
+        // NaN indicates unreliable bounds. See the StrictMetricsEvaluator 
docs for
+        // more.
+        return kRowsMightNotMatch;
+      }
+
+      if (lower > lit) {
+        return kRowsMustMatch;
+      }
+    }
+
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> GtEq(const std::shared_ptr<Bound>& expr, const Literal& lit) 
override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    // Rows must match when: <-------X---Min----Max---------->
+    int32_t id = expr->reference()->field().field_id();
+
+    ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+    if (is_nested) {
+      return kRowsMightNotMatch;
+    }
+
+    if (CanContainNulls(id) || CanContainNaNs(id)) {
+      return kRowsMightNotMatch;
+    }
+
+    auto it = data_file_.lower_bounds.find(id);
+    if (it != data_file_.lower_bounds.cend()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, it->second));
+      if (lower.IsNaN()) {
+        // NaN indicates unreliable bounds. See the StrictMetricsEvaluator 
docs for
+        // more.
+        return kRowsMightNotMatch;
+      }
+
+      if (lower >= lit) {
+        return kRowsMustMatch;
+      }
+    }
+
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> Eq(const std::shared_ptr<Bound>& expr, const Literal& lit) 
override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    // Rows must match when Min == X == Max
+    int32_t id = expr->reference()->field().field_id();
+
+    ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+    if (is_nested) {
+      return kRowsMightNotMatch;
+    }
+
+    if (CanContainNulls(id) || CanContainNaNs(id)) {
+      return kRowsMightNotMatch;
+    }
+    auto lower_it = data_file_.lower_bounds.find(id);
+    auto upper_it = data_file_.upper_bounds.find(id);
+    if (lower_it != data_file_.lower_bounds.cend() &&
+        upper_it != data_file_.upper_bounds.cend()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, lower_it->second));
+      if (lower != lit) {
+        return kRowsMightNotMatch;
+      }
+      ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, upper_it->second));
+      if (upper != lit) {
+        return kRowsMightNotMatch;
+      }
+
+      return kRowsMustMatch;
+    }
+
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> NotEq(const std::shared_ptr<Bound>& expr, const Literal& lit) 
override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    // Rows must match when X < Min or Max < X because it is not in the range
+    int32_t id = expr->reference()->field().field_id();
+
+    ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+    if (is_nested) {
+      return kRowsMightNotMatch;
+    }
+
+    if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) {
+      return kRowsMustMatch;
+    }
+
+    auto lower_it = data_file_.lower_bounds.find(id);
+    if (lower_it != data_file_.lower_bounds.cend()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, lower_it->second));
+      if (lower.IsNaN()) {
+        // NaN indicates unreliable bounds. See the StrictMetricsEvaluator 
docs for
+        // more.
+        return kRowsMightNotMatch;
+      }
+      if (lower > lit) {
+        return kRowsMustMatch;
+      }
+    }
+
+    auto upper_it = data_file_.upper_bounds.find(id);
+    if (upper_it != data_file_.upper_bounds.cend()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, upper_it->second));
+      if (upper < lit) {
+        return kRowsMustMatch;
+      }
+    }
+
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> In(const std::shared_ptr<Bound>& expr,
+                  const BoundSetPredicate::LiteralSet& literal_set) override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    int32_t id = expr->reference()->field().field_id();
+
+    ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+    if (is_nested) {
+      return kRowsMightNotMatch;
+    }
+
+    if (CanContainNulls(id) || CanContainNaNs(id)) {
+      return kRowsMightNotMatch;
+    }
+    auto lower_it = data_file_.lower_bounds.find(id);
+    auto upper_it = data_file_.upper_bounds.find(id);
+    if (lower_it != data_file_.lower_bounds.cend() &&
+        upper_it != data_file_.upper_bounds.cend()) {
+      // similar to the implementation in eq, first check if the lower bound 
is in the
+      // set
+      ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, lower_it->second));
+      if (!literal_set.contains(lower)) {
+        return kRowsMightNotMatch;
+      }
+      // check if the upper bound is in the set
+      ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, upper_it->second));
+      if (!literal_set.contains(upper)) {
+        return kRowsMightNotMatch;
+      }
+      // finally check if the lower bound and the upper bound are equal
+      if (lower != upper) {
+        return kRowsMightNotMatch;
+      }
+
+      // All values must be in the set if the lower bound and the upper bound 
are in the
+      // set and are equal.
+      return kRowsMustMatch;
+    }
+
+    return kRowsMightNotMatch;
+  }
+
+  // Strict evaluation of `ref NOT IN (literal_set)`.
+  //
+  // Rows must match when the column holds only nulls or only NaNs, or when no literal of
+  // the set can fall inside the [lower, upper] range recorded for the column. A bound
+  // that is absent leaves that side of the range open. Returns kRowsMightNotMatch for
+  // non-reference terms, nested columns, a NaN lower bound, or when no bound is known.
+  Result<bool> NotIn(const std::shared_ptr<Bound>& expr,
+                     const BoundSetPredicate::LiteralSet& literal_set) override {
+    RETURN_IF_NOT_REFERENCE(expr);
+
+    int32_t id = expr->reference()->field().field_id();
+
+    ICEBERG_ASSIGN_OR_RAISE(auto is_nested, IsNestedColumn(id));
+    if (is_nested) {
+      return kRowsMightNotMatch;
+    }
+
+    if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) {
+      return kRowsMustMatch;
+    }
+    std::optional<Literal> lower_bound;
+    auto lower_it = data_file_.lower_bounds.find(id);
+    if (lower_it != data_file_.lower_bounds.cend()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseBound(expr, lower_it->second));
+      if (lower.IsNaN()) {
+        // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for
+        // more.
+        return kRowsMightNotMatch;
+      }
+      lower_bound = std::move(lower);
+    }
+    // Keep the literals that are not below the lower bound. Without a lower bound no
+    // literal can be ruled out, so all of them are kept for the upper-bound check;
+    // dropping them here would make that check succeed vacuously.
+    auto literals_view = literal_set | std::views::filter([&](const Literal& lit) {
+                           return !lower_bound.has_value() || lower_bound.value() <= lit;
+                         });
+    // if all values are less than lower bound, rows must
+    // match (notIn).
+    if (lower_bound.has_value() && literals_view.empty()) {
+      return kRowsMustMatch;
+    }
+
+    auto upper_it = data_file_.upper_bounds.find(id);
+    if (upper_it != data_file_.upper_bounds.cend()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseBound(expr, upper_it->second));
+      auto filtered_view = literals_view | std::views::filter([&](const Literal& lit) {
+                             return upper >= lit;
+                           });
+      if (filtered_view.empty()) {
+        // if all remaining values are greater than upper bound,
+        // rows must match
+        // (notIn).
+        return kRowsMustMatch;
+      }
+    }
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> StartsWith(const std::shared_ptr<Bound>& expr,
+                          const Literal& lit) override {
+    return kRowsMightNotMatch;
+  }
+
+  Result<bool> NotStartsWith(const std::shared_ptr<Bound>& expr,
+                             const Literal& lit) override {
+    // TODO(xiao.dong) Handle cases that definitely cannot match,
+    // such as notStartsWith("x") when
+    // the bounds are ["a", "b"].
+    return kRowsMightNotMatch;
+  }
+
+ private:
+  Result<Literal> ParseBound(const std::shared_ptr<Bound>& expr,
+                             const std::vector<uint8_t>& stats) {
+    auto type = expr->reference()->type();
+    if (!type->is_primitive()) {
+      return NotSupported("Bound of non-primitive type is not supported.");
+    }
+    auto primitive_type = internal::checked_pointer_cast<PrimitiveType>(type);
+    return Literal::Deserialize(stats, primitive_type);
+  }
+
+  // Reports whether the file may hold null values for field `id`. An empty
+  // null_value_counts map rules nothing out, so the answer is true; otherwise the field
+  // needs a recorded null count greater than zero.
+  bool CanContainNulls(int32_t id) {
+    const auto& null_counts = data_file_.null_value_counts;
+    if (null_counts.empty()) {
+      return true;
+    }
+    if (auto entry = null_counts.find(id); entry != null_counts.cend()) {
+      return entry->second > 0;
+    }
+    return false;
+  }
+
+  bool CanContainNaNs(int32_t id) {
+    // nan counts might be null for early version writers when nan counters 
are not
+    // populated.
+    auto it = data_file_.nan_value_counts.find(id);
+    return it != data_file_.nan_value_counts.cend() && it->second > 0;
+  }
+
+  bool ContainsNullsOnly(int32_t id) {
+    auto val_it = data_file_.value_counts.find(id);
+    auto null_it = data_file_.null_value_counts.find(id);
+    return val_it != data_file_.value_counts.cend() &&
+           null_it != data_file_.null_value_counts.cend() &&
+           val_it->second == null_it->second;
+  }
+
+  bool ContainsNaNsOnly(int32_t id) {
+    auto val_it = data_file_.value_counts.find(id);
+    auto nan_it = data_file_.nan_value_counts.find(id);
+    return val_it != data_file_.value_counts.cend() &&
+           nan_it != data_file_.nan_value_counts.cend() &&
+           val_it->second == nan_it->second;
+  }
+
+  Result<bool> IsNestedColumn(int32_t id) {
+    // XXX: null_count might be missing from nested columns but required  by
+    // StrictMetricsEvaluator.
+    // See https://github.com/apache/iceberg/pull/11261.
+    ICEBERG_ASSIGN_OR_RAISE(auto field, schema_.GetFieldById(id));
+    return !field.has_value() || field->get().type()->is_nested();
+  }
+
+ private:
+  const DataFile& data_file_;
+  const Schema& schema_;
+};
+
+StrictMetricsEvaluator::StrictMetricsEvaluator(std::shared_ptr<Expression> 
expr,
+                                               std::shared_ptr<Schema> schema)
+    : expr_(std::move(expr)), schema_(std::move(schema)) {}
+
+StrictMetricsEvaluator::~StrictMetricsEvaluator() = default;
+
+// Builds an evaluator for `expr`: the expression is first passed through RewriteNot and
+// the result is bound to `schema`, honoring `case_sensitive`. An error from either step
+// is returned to the caller.
+Result<std::unique_ptr<StrictMetricsEvaluator>> StrictMetricsEvaluator::Make(
+    std::shared_ptr<Expression> expr, std::shared_ptr<Schema> schema,
+    bool case_sensitive) {
+  ICEBERG_ASSIGN_OR_RAISE(auto without_not, RewriteNot::Visit(std::move(expr)));
+  ICEBERG_ASSIGN_OR_RAISE(auto bound, Binder::Bind(*schema, without_not, case_sensitive));
+  // The constructor is private, so the instance is created with `new` here.
+  return std::unique_ptr<StrictMetricsEvaluator>(
+      new StrictMetricsEvaluator(std::move(bound), std::move(schema)));
+}
+
+// Runs the bound expression against the metrics of `data_file`.
+Result<bool> StrictMetricsEvaluator::Evaluate(const DataFile& data_file) const {
+  // A file without rows has no row that could fail the expression.
+  const bool has_rows = data_file.record_count > 0;
+  if (!has_rows) {
+    return kRowsMustMatch;
+  }
+  StrictMetricsVisitor metrics_visitor(data_file, *schema_);
+  return Visit<bool, StrictMetricsVisitor>(expr_, metrics_visitor);
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/expression/strict_metrics_evaluator.h 
b/src/iceberg/expression/strict_metrics_evaluator.h
new file mode 100644
index 00000000..60dc74a9
--- /dev/null
+++ b/src/iceberg/expression/strict_metrics_evaluator.h
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/expression/strict_metrics_evaluator.h
+///
+/// Evaluates an Expression on a DataFile to test whether all rows in the file 
match.
+///
+/// This evaluation is strict: it returns true if all rows in a file must 
match the
+/// expression. For example, if a file's ts column has min X and max Y, this evaluator
+/// will return true for ts < Y+1 but not for ts < Y-1.
+///
+/// Files are passed to StrictMetricsEvaluator::Evaluate(), which returns true if all
+/// rows in the file must match and false if the file may contain rows that do not match.
+///
+/// Due to the comparison implementation of ORC stats, for float/double 
columns in ORC
+/// files, if the first value in a file is NaN, metrics of this file will 
report NaN for
+/// both upper and lower bound even though the column could contain non-NaN data. Thus
+/// in some scenarios an explicit check for NaN is necessary in order to not include
+/// files that may contain rows that don't match.
+///
+
+#include <memory>
+
+#include "iceberg/expression/expression.h"
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+
+namespace iceberg {
+
+/// \brief Evaluates an Expression against DataFile.
+/// \note: The evaluator is thread-safe.
+class ICEBERG_EXPORT StrictMetricsEvaluator {
+ public:
+  /// \brief Make a strict metrics evaluator
+  ///
+  /// \param expr The expression to evaluate
+  /// \param schema The schema of the table
+  /// \param case_sensitive Whether field name matching is case-sensitive
+  /// \return The evaluator, or an error if rewriting or binding the expression fails
+  static Result<std::unique_ptr<StrictMetricsEvaluator>> Make(
+      std::shared_ptr<Expression> expr, std::shared_ptr<Schema> schema,
+      bool case_sensitive = true);
+
+  ~StrictMetricsEvaluator();
+
+  /// \brief Evaluate the expression against a DataFile.
+  ///
+  /// \param data_file The data file to evaluate
+  /// \return true if all rows in the file must match the expression, false if some rows
+  /// might not match, or error
+  Result<bool> Evaluate(const DataFile& data_file) const;
+
+ private:
+  /// Private: instances are created through Make().
+  explicit StrictMetricsEvaluator(std::shared_ptr<Expression> expr,
+                                  std::shared_ptr<Schema> schema);
+
+ private:
+  /// Expression that Evaluate() visits for each data file.
+  std::shared_ptr<Expression> expr_;
+  /// Schema handed to Make(); Evaluate() passes it to the metrics visitor.
+  std::shared_ptr<Schema> schema_;
+};
+
+}  // namespace iceberg
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index 5a993338..c139c66b 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -51,6 +51,7 @@ iceberg_sources = files(
     'expression/literal.cc',
     'expression/predicate.cc',
     'expression/rewrite_not.cc',
+    'expression/strict_metrics_evaluator.cc',
     'expression/term.cc',
     'file_reader.cc',
     'file_writer.cc',
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index a13d1f82..9892e3d4 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -87,7 +87,8 @@ add_iceberg_test(expression_test
                  literal_test.cc
                  inclusive_metrics_evaluator_test.cc
                  inclusive_metrics_evaluator_with_transform_test.cc
-                 predicate_test.cc)
+                 predicate_test.cc
+                 strict_metrics_evaluator_test.cc)
 
 add_iceberg_test(json_serde_test
                  SOURCES
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index 4cb153ba..c73abe18 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -65,6 +65,7 @@ iceberg_tests = {
             'inclusive_metrics_evaluator_with_transform_test.cc',
             'literal_test.cc',
             'predicate_test.cc',
+            'strict_metrics_evaluator_test.cc',
         ),
     },
     'json_serde_test': {
diff --git a/src/iceberg/test/strict_metrics_evaluator_test.cc 
b/src/iceberg/test/strict_metrics_evaluator_test.cc
new file mode 100644
index 00000000..fa6185c3
--- /dev/null
+++ b/src/iceberg/test/strict_metrics_evaluator_test.cc
@@ -0,0 +1,849 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/expression/strict_metrics_evaluator.h"
+
+#include <limits>
+
+#include <gtest/gtest.h>
+
+#include "iceberg/expression/binder.h"
+#include "iceberg/expression/expressions.h"
+#include "iceberg/manifest/manifest_entry.h"
+#include "iceberg/schema.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/type.h"
+
+namespace iceberg {
+
+namespace {
+// Named expectations for the boolean produced by StrictMetricsEvaluator::Evaluate,
+// so assertions below read as intent rather than bare true/false.
+// `true`: the tests expect the evaluator to report that rows must match.
+constexpr bool kRowsMustMatch = true;
+// `false`: the tests expect the evaluator to report that rows might not match.
+constexpr bool kRowsMightNotMatch = false;
+}  // namespace
+// Value type for the named column bounds handed to PrepareDataFile(); it has one
+// alternative per primitive type that helper reads back with std::get.
+using TestVariant = std::variant<bool, int32_t, int64_t, double, std::string>;
+
+/// Fixture for StrictMetricsEvaluator tests over a six-column schema
+/// (id, name, age, salary, active, date).
+///
+/// The helpers build a DataFile carrying column metrics and evaluate an unbound
+/// expression against a canned file.
+class StrictMetricsEvaluatorTest : public ::testing::Test {
+ protected:
+  /// Creates the schema shared by every test in this fixture.
+  void SetUp() override {
+    schema_ = std::make_shared<Schema>(
+        std::vector<SchemaField>{
+            SchemaField::MakeRequired(1, "id", int64()),
+            SchemaField::MakeOptional(2, "name", string()),
+            SchemaField::MakeRequired(3, "age", int32()),
+            SchemaField::MakeOptional(4, "salary", float64()),
+            SchemaField::MakeRequired(5, "active", boolean()),
+            SchemaField::MakeRequired(6, "date", string()),
+        },
+        /*schema_id=*/0);
+  }
+
+  /// Binds `expr` against the fixture schema.
+  Result<std::shared_ptr<Expression>> Bind(const std::shared_ptr<Expression>& expr,
+                                           bool case_sensitive = true) {
+    return Binder::Bind(*schema_, expr, case_sensitive);
+  }
+
+  /// Builds a Parquet DataFile populated with the given metrics.
+  ///
+  /// \param partition single string value appended to the file's partition
+  /// \param record_count value stored in DataFile::record_count
+  /// \param file_size_in_bytes value stored in DataFile::file_size_in_bytes
+  /// \param lower_bounds lower bounds keyed by column name
+  /// \param upper_bounds upper bounds keyed by column name
+  /// \param value_counts value counts keyed by field id
+  /// \param null_counts null counts keyed by field id
+  /// \param nan_counts NaN counts keyed by field id
+  /// \return the populated data file
+  std::shared_ptr<DataFile> PrepareDataFile(
+      const std::string& partition, int64_t record_count, int64_t file_size_in_bytes,
+      const std::map<std::string, TestVariant>& lower_bounds,
+      const std::map<std::string, TestVariant>& upper_bounds,
+      const std::map<int32_t, int64_t>& value_counts = {},
+      const std::map<int32_t, int64_t>& null_counts = {},
+      const std::map<int32_t, int64_t>& nan_counts = {}) {
+    // Serializes every named bound under the field id of its column. Only the
+    // five names below are recognized; any other name is skipped silently.
+    const auto fill_bounds = [](const std::map<std::string, TestVariant>& named,
+                                std::map<int32_t, std::vector<uint8_t>>& by_field_id) {
+      for (const auto& entry : named) {
+        const std::string& column = entry.first;
+        const TestVariant& bound = entry.second;
+        if (column == "id") {
+          by_field_id[1] = Literal::Long(std::get<int64_t>(bound)).Serialize().value();
+          continue;
+        }
+        if (column == "name") {
+          by_field_id[2] =
+              Literal::String(std::get<std::string>(bound)).Serialize().value();
+          continue;
+        }
+        if (column == "age") {
+          by_field_id[3] = Literal::Int(std::get<int32_t>(bound)).Serialize().value();
+          continue;
+        }
+        if (column == "salary") {
+          by_field_id[4] = Literal::Double(std::get<double>(bound)).Serialize().value();
+          continue;
+        }
+        if (column == "active") {
+          by_field_id[5] = Literal::Boolean(std::get<bool>(bound)).Serialize().value();
+        }
+      }
+    };
+
+    auto file = std::make_shared<DataFile>();
+    // File identity.
+    file->file_path = "test_path";
+    file->file_format = FileFormatType::kParquet;
+    file->partition.AddValue(Literal::String(partition));
+    file->record_count = record_count;
+    file->file_size_in_bytes = file_size_in_bytes;
+    // Column metrics.
+    file->column_sizes = {};
+    file->value_counts = value_counts;
+    file->null_value_counts = null_counts;
+    file->nan_value_counts = nan_counts;
+    file->split_offsets = {1};
+    file->sort_order_id = 0;
+    fill_bounds(upper_bounds, file->upper_bounds);
+    fill_bounds(lower_bounds, file->lower_bounds);
+    return file;
+  }
+
+  /// Evaluates `unbound` against a 10-row file whose "id" column spans
+  /// [100, 200] with no nulls, and asserts the evaluator's answer.
+  void TestCase(const std::shared_ptr<Expression>& unbound, bool expected_result) {
+    ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+                           StrictMetricsEvaluator::Make(unbound, schema_, true));
+    auto data_file =
+        PrepareDataFile(/*partition=*/"20251128", /*record_count=*/10,
+                        /*file_size_in_bytes=*/1024,
+                        /*lower_bounds=*/{{"id", static_cast<int64_t>(100)}},
+                        /*upper_bounds=*/{{"id", static_cast<int64_t>(200)}},
+                        /*value_counts=*/{{1, 10}}, /*null_counts=*/{{1, 0}});
+    auto outcome = evaluator->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), expected_result) << unbound->ToString();
+  }
+
+  /// Evaluates `unbound` against a 10-row file whose "name" column spans
+  /// ["123", "456"] with no nulls, and asserts the evaluator's answer.
+  void TestStringCase(const std::shared_ptr<Expression>& unbound, bool expected_result) {
+    ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+                           StrictMetricsEvaluator::Make(unbound, schema_, true));
+    auto data_file =
+        PrepareDataFile(/*partition=*/"20251128", /*record_count=*/10,
+                        /*file_size_in_bytes=*/1024,
+                        /*lower_bounds=*/{{"name", "123"}},
+                        /*upper_bounds=*/{{"name", "456"}},
+                        /*value_counts=*/{{2, 10}}, /*null_counts=*/{{2, 0}});
+    auto outcome = evaluator->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), expected_result) << unbound->ToString();
+  }
+
+  // Schema every expression in this fixture is bound against.
+  std::shared_ptr<Schema> schema_;
+};
+
+// Column names resolve case-sensitively unless the evaluator is created with
+// case_sensitive = false.
+TEST_F(StrictMetricsEvaluatorTest, CaseSensitiveTest) {
+  // Exact-case name, case-sensitive binding: creation succeeds.
+  auto exact_name = Expressions::Equal("id", Literal::Long(300));
+  auto exact_result = StrictMetricsEvaluator::Make(exact_name, schema_, true);
+  ASSERT_TRUE(exact_result.has_value());
+
+  // Upper-case name, case-sensitive binding: creation is rejected.
+  auto strict_upper = Expressions::Equal("ID", Literal::Long(300));
+  auto strict_result = StrictMetricsEvaluator::Make(strict_upper, schema_, true);
+  ASSERT_FALSE(strict_result.has_value());
+  ASSERT_EQ(strict_result.error().kind, ErrorKind::kInvalidExpression);
+
+  // Upper-case name, case-insensitive binding: creation succeeds.
+  auto relaxed_upper = Expressions::Equal("ID", Literal::Long(300));
+  auto relaxed_result = StrictMetricsEvaluator::Make(relaxed_upper, schema_, false);
+  ASSERT_TRUE(relaxed_result.has_value());
+}
+
+// is_null("name") over files with 10 values and a varying null count.
+TEST_F(StrictMetricsEvaluatorTest, IsNullTest) {
+  struct Case {
+    int64_t null_count;
+    bool expected;
+  };
+  const Case cases[] = {
+      {5, kRowsMightNotMatch},  // only half of the values are null
+      {10, kRowsMustMatch},     // null count equals the value count
+  };
+  for (const Case& c : cases) {
+    auto predicate = Expressions::IsNull("name");
+    ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file =
+        PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}},
+                        {{2, 10}}, {{2, c.null_count}}, {});
+    auto outcome = evaluator->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), c.expected) << predicate->ToString();
+  }
+}
+
+// not_null("name") over files with 10 values and a varying null count.
+TEST_F(StrictMetricsEvaluatorTest, NotNullTest) {
+  struct Case {
+    int64_t null_count;
+    bool expected;
+  };
+  const Case cases[] = {
+      {5, kRowsMightNotMatch},  // some values are null
+      {0, kRowsMustMatch},      // no nulls recorded
+  };
+  for (const Case& c : cases) {
+    auto predicate = Expressions::NotNull("name");
+    ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file =
+        PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}},
+                        {{2, 10}}, {{2, c.null_count}}, {});
+    auto outcome = evaluator->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), c.expected) << predicate->ToString();
+  }
+}
+
+// is_nan("salary") over files with 10 values (field id 4) and varying null and
+// NaN counts.
+TEST_F(StrictMetricsEvaluatorTest, IsNanTest) {
+  struct Case {
+    int64_t null_count;
+    int64_t nan_count;
+    bool expected;
+  };
+  const Case cases[] = {
+      {5, 5, kRowsMightNotMatch},   // half nulls, half NaNs
+      {10, 5, kRowsMightNotMatch},  // null count equals the value count
+      {5, 10, kRowsMustMatch},      // NaN count equals the value count
+  };
+  for (const Case& c : cases) {
+    auto predicate = Expressions::IsNaN("salary");
+    ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+                                     {{"salary", 2.0}}, {{4, 10}},
+                                     {{4, c.null_count}}, {{4, c.nan_count}});
+    auto outcome = evaluator->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), c.expected) << predicate->ToString();
+  }
+}
+
+// not_nan("salary") over files with 10 values (field id 4); each case supplies
+// either a NaN count or a null count, never both.
+TEST_F(StrictMetricsEvaluatorTest, NotNanTest) {
+  using Counts = std::map<int32_t, int64_t>;
+  struct Case {
+    Counts null_counts;
+    Counts nan_counts;
+    bool expected;
+  };
+  const Case cases[] = {
+      {Counts{}, Counts{{4, 5}}, kRowsMightNotMatch},  // some NaNs recorded
+      {Counts{}, Counts{{4, 0}}, kRowsMustMatch},      // zero NaNs recorded
+      {Counts{{4, 10}}, Counts{}, kRowsMustMatch},     // null count equals value count
+  };
+  for (const Case& c : cases) {
+    auto predicate = Expressions::NotNaN("salary");
+    ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}},
+                                     {{"salary", 2.0}}, {{4, 10}}, c.null_counts,
+                                     c.nan_counts);
+    auto outcome = evaluator->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), c.expected) << predicate->ToString();
+  }
+}
+
+// id < literal, against the TestCase() file whose "id" bounds are [100, 200].
+TEST_F(StrictMetricsEvaluatorTest, LTTest) {
+  struct Case {
+    int64_t literal;
+    bool expected;
+  };
+  const Case cases[] = {
+      {300, kRowsMustMatch},      // above the upper bound
+      {150, kRowsMightNotMatch},  // inside the range
+      {100, kRowsMightNotMatch},  // equal to the lower bound
+      {200, kRowsMightNotMatch},  // equal to the upper bound
+      {99, kRowsMightNotMatch},   // below the lower bound
+  };
+  for (const Case& c : cases) {
+    TestCase(Expressions::LessThan("id", Literal::Long(c.literal)), c.expected);
+  }
+}
+
+// id <= literal, against the TestCase() file whose "id" bounds are [100, 200].
+TEST_F(StrictMetricsEvaluatorTest, LTEQTest) {
+  struct Case {
+    int64_t literal;
+    bool expected;
+  };
+  const Case cases[] = {
+      {300, kRowsMustMatch},      // above the upper bound
+      {150, kRowsMightNotMatch},  // inside the range
+      {100, kRowsMightNotMatch},  // equal to the lower bound
+      {200, kRowsMustMatch},      // equal to the upper bound
+      {99, kRowsMightNotMatch},   // below the lower bound
+  };
+  for (const Case& c : cases) {
+    TestCase(Expressions::LessThanOrEqual("id", Literal::Long(c.literal)), c.expected);
+  }
+}
+
+// id > literal, against the TestCase() file whose "id" bounds are [100, 200].
+TEST_F(StrictMetricsEvaluatorTest, GTTest) {
+  struct Case {
+    int64_t literal;
+    bool expected;
+  };
+  const Case cases[] = {
+      {300, kRowsMightNotMatch},  // above the upper bound
+      {150, kRowsMightNotMatch},  // inside the range
+      {100, kRowsMightNotMatch},  // equal to the lower bound
+      {200, kRowsMightNotMatch},  // equal to the upper bound
+      {99, kRowsMustMatch},       // below the lower bound
+  };
+  for (const Case& c : cases) {
+    TestCase(Expressions::GreaterThan("id", Literal::Long(c.literal)), c.expected);
+  }
+}
+
+// id >= literal, against the TestCase() file whose "id" bounds are [100, 200].
+TEST_F(StrictMetricsEvaluatorTest, GTEQTest) {
+  struct Case {
+    int64_t literal;
+    bool expected;
+  };
+  const Case cases[] = {
+      {300, kRowsMightNotMatch},  // above the upper bound
+      {150, kRowsMightNotMatch},  // inside the range
+      {100, kRowsMustMatch},      // equal to the lower bound
+      {200, kRowsMightNotMatch},  // equal to the upper bound
+      {99, kRowsMustMatch},       // below the lower bound
+  };
+  for (const Case& c : cases) {
+    TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(c.literal)),
+             c.expected);
+  }
+}
+
+// id == literal. With bounds [100, 200] no literal is expected to match every
+// row; with both bounds at 100 only the literal 100 is.
+TEST_F(StrictMetricsEvaluatorTest, EQTest) {
+  const int64_t ranged_literals[] = {300, 150, 100, 200};
+  for (const int64_t literal : ranged_literals) {
+    TestCase(Expressions::Equal("id", Literal::Long(literal)), kRowsMightNotMatch);
+  }
+
+  // Same flow as TestCase(), but lower and upper bound are both 100.
+  const auto check_single_value_file = [&](const std::shared_ptr<Expression>& predicate,
+                                           bool expected) {
+    ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file =
+        PrepareDataFile(/*partition=*/"20251128", /*record_count=*/10,
+                        /*file_size_in_bytes=*/1024,
+                        /*lower_bounds=*/{{"id", static_cast<int64_t>(100)}},
+                        /*upper_bounds=*/{{"id", static_cast<int64_t>(100)}},
+                        /*value_counts=*/{{1, 10}}, /*null_counts=*/{{1, 0}});
+    auto outcome = evaluator->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), expected) << predicate->ToString();
+  };
+  check_single_value_file(Expressions::Equal("id", Literal::Long(100)), kRowsMustMatch);
+  check_single_value_file(Expressions::Equal("id", Literal::Long(200)),
+                          kRowsMightNotMatch);
+}
+
+// id != literal, against the TestCase() file whose "id" bounds are [100, 200].
+TEST_F(StrictMetricsEvaluatorTest, NotEqTest) {
+  struct Case {
+    int64_t literal;
+    bool expected;
+  };
+  const Case cases[] = {
+      {300, kRowsMustMatch},      // above the upper bound
+      {150, kRowsMightNotMatch},  // inside the range
+      {100, kRowsMightNotMatch},  // equal to the lower bound
+      {200, kRowsMightNotMatch},  // equal to the upper bound
+      {99, kRowsMustMatch},       // below the lower bound
+  };
+  for (const Case& c : cases) {
+    TestCase(Expressions::NotEqual("id", Literal::Long(c.literal)), c.expected);
+  }
+}
+
+// id IN (...). With bounds [100, 200] the set is not expected to cover every
+// row; with both bounds at 100 the set must contain 100.
+TEST_F(StrictMetricsEvaluatorTest, InTest) {
+  auto wide_set = Expressions::In("id", {
+                                            Literal::Long(100),
+                                            Literal::Long(200),
+                                            Literal::Long(300),
+                                            Literal::Long(400),
+                                            Literal::Long(500),
+                                        });
+  TestCase(wide_set, kRowsMightNotMatch);
+
+  // Same flow as TestCase(), but lower and upper bound are both 100.
+  const auto check_single_value_file = [&](const std::shared_ptr<Expression>& predicate,
+                                           bool expected) {
+    ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+                           StrictMetricsEvaluator::Make(predicate, schema_, true));
+    auto data_file =
+        PrepareDataFile(/*partition=*/"20251128", /*record_count=*/10,
+                        /*file_size_in_bytes=*/1024,
+                        /*lower_bounds=*/{{"id", static_cast<int64_t>(100)}},
+                        /*upper_bounds=*/{{"id", static_cast<int64_t>(100)}},
+                        /*value_counts=*/{{1, 10}}, /*null_counts=*/{{1, 0}});
+    auto outcome = evaluator->Evaluate(*data_file);
+    ASSERT_TRUE(outcome.has_value());
+    ASSERT_EQ(outcome.value(), expected) << predicate->ToString();
+  };
+  check_single_value_file(
+      Expressions::In("id", {Literal::Long(100), Literal::Long(200)}), kRowsMustMatch);
+  check_single_value_file(
+      Expressions::In("id", {Literal::Long(200), Literal::Long(300)}),
+      kRowsMightNotMatch);
+}
+
+// id NOT IN (...), against the TestCase() file whose "id" bounds are [100, 200].
+TEST_F(StrictMetricsEvaluatorTest, NotInTest) {
+  // Every literal lies below the lower bound.
+  auto all_below = Expressions::NotIn("id", {Literal::Long(88), Literal::Long(99)});
+  TestCase(all_below, kRowsMustMatch);
+
+  // Every literal lies above the upper bound.
+  auto all_above = Expressions::NotIn("id", {Literal::Long(288), Literal::Long(299)});
+  TestCase(all_above, kRowsMustMatch);
+
+  // Literals on both sides of the range, none inside it.
+  auto both_sides = Expressions::NotIn(
+      "id", {Literal::Long(88), Literal::Long(288), Literal::Long(299)});
+  TestCase(both_sides, kRowsMustMatch);
+
+  // One literal equals the lower bound.
+  auto touches_lower = Expressions::NotIn("id", {Literal::Long(88), Literal::Long(100)});
+  TestCase(touches_lower, kRowsMightNotMatch);
+
+  // One literal falls inside the range.
+  auto one_inside = Expressions::NotIn("id", {Literal::Long(88), Literal::Long(101)});
+  TestCase(one_inside, kRowsMightNotMatch);
+
+  // Both literals fall within [100, 200].
+  auto all_inside = Expressions::NotIn("id", {Literal::Long(100), Literal::Long(101)});
+  TestCase(all_inside, kRowsMightNotMatch);
+}
+
+TEST_F(StrictMetricsEvaluatorTest, StartsWithTest) {
+  // TestStringCase() uses "name" bounds "123".."456", which do not guarantee
+  // that every value starts with "1"; the expected answer is kRowsMightNotMatch.
+  TestStringCase(Expressions::StartsWith("name", "1"), kRowsMightNotMatch);
+}
+
+// not_starts_with("name", "1") against the "123".."456" string file is expected
+// to report kRowsMightNotMatch.
+TEST_F(StrictMetricsEvaluatorTest, NotStartsWithTest) {
+  auto predicate = Expressions::NotStartsWith("name", "1");
+  TestStringCase(predicate, kRowsMightNotMatch);
+}
+
+/// Fixture with a wider schema (ids 1-17, including a nested struct) and three
+/// pre-built data files carrying different metrics.
+///
+/// It reuses the `schema_` member inherited from StrictMetricsEvaluatorTest
+/// instead of declaring its own, so the inherited helpers (Bind, TestCase,
+/// TestStringCase) see the schema installed by this SetUp().
+class StrictMetricsEvaluatorMigratedTest : public StrictMetricsEvaluatorTest {
+ protected:
+  // Lower and upper bounds recorded for the "id" column (and for the nested
+  // "nested_col_with_stats" column) in the primary file.
+  static constexpr int64_t kIntMinValue = 30;
+  static constexpr int64_t kIntMaxValue = 79;
+  // Both bounds of the "always_5" column in the primary file.
+  static constexpr int64_t kAlwaysFive = 5;
+
+  /// Installs the schema and builds the three shared data files. The base
+  /// class SetUp() is intentionally not called; this schema replaces it.
+  void SetUp() override {
+    schema_ = std::make_shared<Schema>(
+        std::vector<SchemaField>{
+            SchemaField::MakeRequired(1, "id", int64()),
+            SchemaField::MakeOptional(2, "no_stats", int64()),
+            SchemaField::MakeRequired(3, "required", string()),
+            SchemaField::MakeOptional(4, "all_nulls", string()),
+            SchemaField::MakeOptional(5, "some_nulls", string()),
+            SchemaField::MakeOptional(6, "no_nulls", string()),
+            SchemaField::MakeRequired(7, "always_5", int64()),
+            SchemaField::MakeOptional(8, "all_nans", float64()),
+            SchemaField::MakeOptional(9, "some_nans", float32()),
+            SchemaField::MakeOptional(10, "no_nans", float32()),
+            SchemaField::MakeOptional(11, "all_nulls_double", float64()),
+            SchemaField::MakeOptional(12, "all_nans_v1_stats", float32()),
+            SchemaField::MakeOptional(13, "nan_and_null_only", float64()),
+            SchemaField::MakeOptional(14, "no_nan_stats", float64()),
+            SchemaField::MakeOptional(
+                15, "struct",
+                std::make_shared<StructType>(std::vector<SchemaField>{
+                    SchemaField::MakeOptional(16, "nested_col_no_stats", int64()),
+                    SchemaField::MakeOptional(17, "nested_col_with_stats", int64())})),
+        },
+        /*schema_id=*/0);
+
+    file_ = MakePrimaryFile();
+    file_with_bounds_ = MakeSomeNullsFile();
+    file_with_equal_bounds_ = MakeSomeNullsEqualBoundsFile();
+  }
+
+  /// Builds the default 50-row file: value/null/NaN counts for the columns the
+  /// column names describe, integer bounds for ids 1, 7 and 17, and NaN bounds
+  /// for ids 12 and 13.
+  std::shared_ptr<DataFile> MakePrimaryFile() {
+    auto data_file = std::make_shared<DataFile>();
+    data_file->file_path = "file.avro";
+    data_file->file_format = FileFormatType::kParquet;
+    data_file->record_count = 50;
+    data_file->value_counts = {
+        {4, 50L},  {5, 50L},  {6, 50L},  {8, 50L},  {9, 50L},  {10, 50L},
+        {11, 50L}, {12, 50L}, {13, 50L}, {14, 50L}, {17, 50L},
+    };
+    data_file->null_value_counts = {
+        {4, 50L}, {5, 10L}, {6, 0L}, {11, 50L}, {12, 0L}, {13, 1L}, {17, 0L},
+    };
+    data_file->nan_value_counts = {
+        {8, 50L},
+        {9, 10L},
+        {10, 0L},
+    };
+    const float float_nan = std::numeric_limits<float>::quiet_NaN();
+    const double double_nan = std::numeric_limits<double>::quiet_NaN();
+    data_file->lower_bounds = {
+        {1, Literal::Long(kIntMinValue).Serialize().value()},
+        {7, Literal::Long(kAlwaysFive).Serialize().value()},
+        {12, Literal::Float(float_nan).Serialize().value()},
+        {13, Literal::Double(double_nan).Serialize().value()},
+        {17, Literal::Long(kIntMinValue).Serialize().value()},
+    };
+    data_file->upper_bounds = {
+        {1, Literal::Long(kIntMaxValue).Serialize().value()},
+        {7, Literal::Long(kAlwaysFive).Serialize().value()},
+        {12, Literal::Float(float_nan).Serialize().value()},
+        {13, Literal::Double(double_nan).Serialize().value()},
+        {17, Literal::Long(kIntMaxValue).Serialize().value()},
+    };
+    return data_file;
+  }
+
+  /// Builds a 50-row file whose "some_nulls" column (id 5) has 10 nulls and
+  /// string bounds ["bbb", "eee"].
+  std::shared_ptr<DataFile> MakeSomeNullsFile() {
+    auto data_file = std::make_shared<DataFile>();
+    data_file->file_path = "file_2.avro";
+    data_file->file_format = FileFormatType::kParquet;
+    data_file->record_count = 50;
+    data_file->value_counts = {
+        {4, 50L},
+        {5, 50L},
+        {6, 50L},
+        {8, 50L},
+    };
+    data_file->null_value_counts = {
+        {4, 50L},
+        {5, 10L},
+        {6, 0L},
+    };
+    data_file->lower_bounds = {
+        {5, Literal::String("bbb").Serialize().value()},
+    };
+    data_file->upper_bounds = {
+        {5, Literal::String("eee").Serialize().value()},
+    };
+    return data_file;
+  }
+
+  /// Builds a 50-row file whose "some_nulls" column (id 5) has 10 nulls and
+  /// identical lower and upper bounds ("bbb").
+  std::shared_ptr<DataFile> MakeSomeNullsEqualBoundsFile() {
+    auto data_file = std::make_shared<DataFile>();
+    data_file->file_path = "file_3.avro";
+    data_file->file_format = FileFormatType::kParquet;
+    data_file->record_count = 50;
+    data_file->value_counts = {
+        {4, 50L},
+        {5, 50L},
+        {6, 50L},
+    };
+    data_file->null_value_counts = {
+        {4, 50L},
+        {5, 10L},
+        {6, 0L},
+    };
+    data_file->lower_bounds = {
+        {5, Literal::String("bbb").Serialize().value()},
+    };
+    data_file->upper_bounds = {
+        {5, Literal::String("bbb").Serialize().value()},
+    };
+    return data_file;
+  }
+
+  /// Builds a 50-row file that carries no column metrics at all.
+  std::shared_ptr<DataFile> MakeMissingStatsFile() {
+    auto data_file = std::make_shared<DataFile>();
+    data_file->file_path = "missing.parquet";
+    data_file->file_format = FileFormatType::kParquet;
+    data_file->record_count = 50;
+    return data_file;
+  }
+
+  /// Builds a file with a record count of zero and no column metrics.
+  std::shared_ptr<DataFile> MakeZeroRecordFile() {
+    auto data_file = std::make_shared<DataFile>();
+    data_file->file_path = "zero.parquet";
+    data_file->file_format = FileFormatType::kParquet;
+    data_file->record_count = 0;
+    return data_file;
+  }
+
+  /// Creates an evaluator for `expr` and asserts its answer for a data file.
+  ///
+  /// \param expr unbound expression to evaluate
+  /// \param expected boolean the evaluator is expected to return
+  /// \param file file to evaluate; the primary file is used when null
+  /// \param case_sensitive forwarded to StrictMetricsEvaluator::Make
+  void ExpectShouldRead(const std::shared_ptr<Expression>& expr, bool expected,
+                        std::shared_ptr<DataFile> file = nullptr,
+                        bool case_sensitive = true) {
+    auto target = file ? file : file_;
+    ICEBERG_UNWRAP_OR_FAIL(auto evaluator,
+                           StrictMetricsEvaluator::Make(expr, schema_, case_sensitive));
+    auto eval_result = evaluator->Evaluate(*target);
+    ASSERT_TRUE(eval_result.has_value());
+    ASSERT_EQ(eval_result.value(), expected) << expr->ToString();
+  }
+
+  // No `schema_` member here: re-declaring it would shadow the inherited one
+  // and leave the base copy null for the inherited helpers.
+  std::shared_ptr<DataFile> file_;
+  std::shared_ptr<DataFile> file_with_bounds_;
+  std::shared_ptr<DataFile> file_with_equal_bounds_;
+};
+
+// not_null over the all-null, partly-null and null-free columns of the primary
+// file, plus not_equal on the all-null column.
+TEST_F(StrictMetricsEvaluatorMigratedTest, AllNulls) {
+  struct Case {
+    const char* column;
+    bool expected;
+  };
+  const Case not_null_cases[] = {
+      {"all_nulls", false},
+      {"some_nulls", false},
+      {"no_nulls", true},
+  };
+  for (const Case& c : not_null_cases) {
+    ExpectShouldRead(Expressions::NotNull(c.column), c.expected);
+  }
+
+  auto not_equal_on_all_nulls = Expressions::NotEqual("all_nulls", Literal::String("a"));
+  ExpectShouldRead(not_equal_on_all_nulls, true);
+}
+
+// is_null over the all-null, partly-null and null-free columns of the primary
+// file.
+TEST_F(StrictMetricsEvaluatorMigratedTest, NoNulls) {
+  struct Case {
+    const char* column;
+    bool expected;
+  };
+  const Case cases[] = {
+      {"all_nulls", true},
+      {"some_nulls", false},
+      {"no_nulls", false},
+  };
+  for (const Case& c : cases) {
+    ExpectShouldRead(Expressions::IsNull(c.column), c.expected);
+  }
+}
+
+// Comparisons on "some_nulls", which records 10 nulls out of 50 values in both
+// files used here; every comparison is expected to evaluate to false even
+// though the recorded bounds satisfy it.
+TEST_F(StrictMetricsEvaluatorMigratedTest, SomeNulls) {
+  // File with bounds ["bbb", "eee"].
+  auto below_ggg = Expressions::LessThan("some_nulls", Literal::String("ggg"));
+  ExpectShouldRead(below_ggg, false, file_with_bounds_);
+
+  auto at_most_eee = Expressions::LessThanOrEqual("some_nulls", Literal::String("eee"));
+  ExpectShouldRead(at_most_eee, false, file_with_bounds_);
+
+  auto above_aaa = Expressions::GreaterThan("some_nulls", Literal::String("aaa"));
+  ExpectShouldRead(above_aaa, false, file_with_bounds_);
+
+  auto at_least_bbb =
+      Expressions::GreaterThanOrEqual("some_nulls", Literal::String("bbb"));
+  ExpectShouldRead(at_least_bbb, false, file_with_bounds_);
+
+  // File whose lower and upper bound are both "bbb".
+  auto equals_bbb = Expressions::Equal("some_nulls", Literal::String("bbb"));
+  ExpectShouldRead(equals_bbb, false, file_with_equal_bounds_);
+}
+
+// is_nan over the floating-point columns of the primary file; only the column
+// whose NaN count equals its value count is expected to evaluate to true.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IsNaN) {
+  struct Case {
+    const char* column;
+    bool expected;
+  };
+  const Case cases[] = {
+      {"all_nans", true},           {"some_nans", false},
+      {"no_nans", false},           {"all_nulls_double", false},
+      {"no_nan_stats", false},      {"all_nans_v1_stats", false},
+      {"nan_and_null_only", false},
+  };
+  for (const Case& c : cases) {
+    ExpectShouldRead(Expressions::IsNaN(c.column), c.expected);
+  }
+}
+
+// not_nan over the floating-point columns of the primary file; the column with
+// a zero NaN count and the all-null column are expected to evaluate to true.
+TEST_F(StrictMetricsEvaluatorMigratedTest, NotNaN) {
+  struct Case {
+    const char* column;
+    bool expected;
+  };
+  const Case cases[] = {
+      {"all_nans", false},          {"some_nans", false},
+      {"no_nans", true},            {"all_nulls_double", true},
+      {"no_nan_stats", false},      {"all_nans_v1_stats", false},
+      {"nan_and_null_only", false},
+  };
+  for (const Case& c : cases) {
+    ExpectShouldRead(Expressions::NotNaN(c.column), c.expected);
+  }
+}
+
+// The "required" column has no metrics in the primary file; not_null is still
+// expected to evaluate to true and is_null to false.
+TEST_F(StrictMetricsEvaluatorMigratedTest, RequiredColumn) {
+  auto required_not_null = Expressions::NotNull("required");
+  ExpectShouldRead(required_not_null, true);
+
+  auto required_is_null = Expressions::IsNull("required");
+  ExpectShouldRead(required_is_null, false);
+}
+
+// Creating an evaluator for a predicate on a column that is not in the schema
+// must fail, and the error message is expected to name the missing field.
+TEST_F(StrictMetricsEvaluatorMigratedTest, MissingColumn) {
+  auto expr = Expressions::LessThan("missing", Literal::Long(5));
+  auto evaluator = StrictMetricsEvaluator::Make(expr, schema_, true);
+  ASSERT_FALSE(evaluator.has_value());
+  // On failure the full message is streamed into the test output.
+  EXPECT_TRUE(evaluator.error().message.contains("Cannot find field 
'missing'"))
+      << evaluator.error().message;
+}
+
+// Against a file that carries no column metrics, every predicate is expected
+// to evaluate to false.
+TEST_F(StrictMetricsEvaluatorMigratedTest, MissingStats) {
+  const std::vector<std::shared_ptr<Expression>> predicates = {
+      Expressions::LessThan("no_stats", Literal::Long(5)),
+      Expressions::LessThanOrEqual("no_stats", Literal::Long(30)),
+      Expressions::Equal("no_stats", Literal::Long(70)),
+      Expressions::GreaterThan("no_stats", Literal::Long(78)),
+      Expressions::GreaterThanOrEqual("no_stats", Literal::Long(90)),
+      Expressions::NotEqual("no_stats", Literal::Long(101)),
+      Expressions::IsNull("no_stats"),
+      Expressions::NotNull("no_stats"),
+      Expressions::IsNaN("all_nans"),
+      Expressions::NotNaN("all_nans"),
+  };
+  auto stats_free_file = MakeMissingStatsFile();
+  for (auto it = predicates.begin(); it != predicates.end(); ++it) {
+    ExpectShouldRead(*it, false, stats_free_file);
+  }
+}
+
+// Against a file with a record count of zero, every predicate is expected to
+// evaluate to true.
+TEST_F(StrictMetricsEvaluatorMigratedTest, ZeroRecordFile) {
+  const std::vector<std::shared_ptr<Expression>> predicates = {
+      Expressions::LessThan("id", Literal::Long(5)),
+      Expressions::LessThanOrEqual("id", Literal::Long(30)),
+      Expressions::Equal("id", Literal::Long(70)),
+      Expressions::GreaterThan("id", Literal::Long(78)),
+      Expressions::GreaterThanOrEqual("id", Literal::Long(90)),
+      Expressions::NotEqual("id", Literal::Long(101)),
+      Expressions::IsNull("some_nulls"),
+      Expressions::NotNull("some_nulls"),
+      Expressions::IsNaN("all_nans"),
+      Expressions::NotNaN("all_nans"),
+  };
+  auto empty_file = MakeZeroRecordFile();
+  for (auto it = predicates.begin(); it != predicates.end(); ++it) {
+    ExpectShouldRead(*it, true, empty_file);
+  }
+}
+
+// Negated comparisons on "id", whose bounds in the primary file are [30, 79].
+TEST_F(StrictMetricsEvaluatorMigratedTest, Not) {
+  constexpr int64_t kBelowRange = kIntMinValue - 25;  // 5, under the lower bound
+
+  // not(id < 5)
+  ExpectShouldRead(
+      Expressions::Not(Expressions::LessThan("id", Literal::Long(kBelowRange))), true);
+  // not(id > 5)
+  ExpectShouldRead(
+      Expressions::Not(Expressions::GreaterThan("id", Literal::Long(kBelowRange))),
+      false);
+}
+
+// Conjunctions of comparisons on "id", whose bounds in the primary file are
+// [30, 79].
+TEST_F(StrictMetricsEvaluatorMigratedTest, And) {
+  constexpr int64_t kFive = kIntMinValue - 25;
+  constexpr int64_t kZero = kIntMinValue - 30;
+  constexpr int64_t kAboveRange = kIntMaxValue + 6;  // 85
+
+  // id > 5 AND id <= 30
+  ExpectShouldRead(
+      Expressions::And(Expressions::GreaterThan("id", Literal::Long(kFive)),
+                       Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue))),
+      false);
+  // id < 5 AND id >= 0
+  ExpectShouldRead(
+      Expressions::And(Expressions::LessThan("id", Literal::Long(kFive)),
+                       Expressions::GreaterThanOrEqual("id", Literal::Long(kZero))),
+      false);
+  // id < 85 AND id >= 0
+  ExpectShouldRead(
+      Expressions::And(Expressions::LessThan("id", Literal::Long(kAboveRange)),
+                       Expressions::GreaterThanOrEqual("id", Literal::Long(kZero))),
+      true);
+}
+
+// Disjunctions of comparisons on "id", whose bounds in the primary file are
+// [30, 79].
+TEST_F(StrictMetricsEvaluatorMigratedTest, Or) {
+  constexpr int64_t kFive = kIntMinValue - 25;
+  constexpr int64_t kJustAboveRange = kIntMaxValue + 1;  // 80
+  constexpr int64_t kInsideRange = kIntMaxValue - 19;    // 60
+
+  // id < 5 OR id >= 80
+  ExpectShouldRead(
+      Expressions::Or(
+          Expressions::LessThan("id", Literal::Long(kFive)),
+          Expressions::GreaterThanOrEqual("id", Literal::Long(kJustAboveRange))),
+      false);
+  // id < 5 OR id >= 60
+  ExpectShouldRead(
+      Expressions::Or(Expressions::LessThan("id", Literal::Long(kFive)),
+                      Expressions::GreaterThanOrEqual("id", Literal::Long(kInsideRange))),
+      false);
+  // id < 5 OR id >= 30
+  ExpectShouldRead(
+      Expressions::Or(Expressions::LessThan("id", Literal::Long(kFive)),
+                      Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMinValue))),
+      true);
+}
+
+// id < literal, with "id" bounds [30, 79] in the primary file.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerLt) {
+  struct Case {
+    int64_t literal;
+    bool expected;
+  };
+  const Case cases[] = {
+      {kIntMinValue, false},
+      {kIntMinValue + 1, false},
+      {kIntMaxValue, false},
+      {kIntMaxValue + 1, true},  // strictly above the upper bound
+  };
+  for (const Case& c : cases) {
+    ExpectShouldRead(Expressions::LessThan("id", Literal::Long(c.literal)), c.expected);
+  }
+}
+
+// id <= literal, with "id" bounds [30, 79] in the primary file.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerLtEq) {
+  struct Case {
+    int64_t literal;
+    bool expected;
+  };
+  const Case cases[] = {
+      {kIntMinValue - 1, false},
+      {kIntMinValue, false},
+      {kIntMaxValue, true},  // equal to the upper bound
+      {kIntMaxValue + 1, true},
+  };
+  for (const Case& c : cases) {
+    ExpectShouldRead(Expressions::LessThanOrEqual("id", Literal::Long(c.literal)),
+                     c.expected);
+  }
+}
+
+// id > literal, with "id" bounds [30, 79] in the primary file.
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerGt) {
+  struct Case {
+    int64_t literal;
+    bool expected;
+  };
+  const Case cases[] = {
+      {kIntMaxValue, false},
+      {kIntMaxValue - 1, false},
+      {kIntMinValue, false},
+      {kIntMinValue - 1, true},  // strictly below the lower bound
+  };
+  for (const Case& c : cases) {
+    ExpectShouldRead(Expressions::GreaterThan("id", Literal::Long(c.literal)),
+                     c.expected);
+  }
+}
+
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerGtEq) {
+  ExpectShouldRead(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue + 1)),
+                   false);
+  ExpectShouldRead(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue)),
+                   false);
+  ExpectShouldRead(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMinValue + 1)),
+                   false);
+  ExpectShouldRead(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMinValue)),
+                   true);
+}
+
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerEq) {
+  ExpectShouldRead(Expressions::Equal("id", Literal::Long(kIntMinValue - 25)), false);
+  ExpectShouldRead(Expressions::Equal("id", Literal::Long(kIntMinValue)), false);
+  ExpectShouldRead(Expressions::Equal("id", Literal::Long(kIntMaxValue - 4)), false);
+  ExpectShouldRead(Expressions::Equal("id", Literal::Long(kIntMaxValue)), false);
+  ExpectShouldRead(Expressions::Equal("id", Literal::Long(kIntMaxValue + 1)), false);
+  ExpectShouldRead(Expressions::Equal("always_5", Literal::Long(kIntMinValue - 25)),
+                   true);
+}
+
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerNotEq) {
+  ExpectShouldRead(Expressions::NotEqual("id", Literal::Long(kIntMinValue - 25)), true);
+  ExpectShouldRead(Expressions::NotEqual("id", Literal::Long(kIntMinValue - 1)), true);
+  ExpectShouldRead(Expressions::NotEqual("id", Literal::Long(kIntMinValue)), false);
+  ExpectShouldRead(Expressions::NotEqual("id", Literal::Long(kIntMaxValue - 4)), false);
+  ExpectShouldRead(Expressions::NotEqual("id", Literal::Long(kIntMaxValue)), false);
+  ExpectShouldRead(Expressions::NotEqual("id", Literal::Long(kIntMaxValue + 1)), true);
+  ExpectShouldRead(Expressions::NotEqual("id", Literal::Long(kIntMaxValue + 6)), true);
+}
+
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerNotEqRewritten) {
+  ExpectShouldRead(
+      Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue - 25))), true);
+  ExpectShouldRead(
+      Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue - 1))), true);
+  ExpectShouldRead(
+      Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue))), false);
+  ExpectShouldRead(
+      Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue - 4))), false);
+  ExpectShouldRead(
+      Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue))), false);
+  ExpectShouldRead(
+      Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue + 1))), true);
+  ExpectShouldRead(
+      Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue + 6))), true);
+}
+
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerIn) {
+  ExpectShouldRead(Expressions::In("id", {Literal::Long(kIntMinValue - 25),
+                                          Literal::Long(kIntMinValue - 24)}),
+                   false);
+  ExpectShouldRead(Expressions::In("id", {Literal::Long(kIntMinValue - 1),
+                                          Literal::Long(kIntMinValue)}),
+                   false);
+  ExpectShouldRead(Expressions::In("id", {Literal::Long(kIntMaxValue - 4),
+                                          Literal::Long(kIntMaxValue - 3)}),
+                   false);
+  ExpectShouldRead(Expressions::In("id", {Literal::Long(kIntMaxValue),
+                                          Literal::Long(kIntMaxValue + 1)}),
+                   false);
+  ExpectShouldRead(Expressions::In("id", {Literal::Long(kIntMaxValue + 1),
+                                          Literal::Long(kIntMaxValue + 2)}),
+                   false);
+  ExpectShouldRead(Expressions::In("always_5", {Literal::Long(5), Literal::Long(6)}),
+                   true);
+  ExpectShouldRead(
+      Expressions::In("all_nulls", {Literal::String("abc"), Literal::String("def")}),
+      false);
+  ExpectShouldRead(
+      Expressions::In("some_nulls", {Literal::String("abc"), Literal::String("def")}),
+      false, file_with_equal_bounds_);
+  ExpectShouldRead(
+      Expressions::In("no_nulls", {Literal::String("abc"), Literal::String("def")}),
+      false);
+}
+
+TEST_F(StrictMetricsEvaluatorMigratedTest, IntegerNotIn) {
+  ExpectShouldRead(Expressions::NotIn("id", {Literal::Long(kIntMinValue - 25),
+                                             Literal::Long(kIntMinValue - 24)}),
+                   true);
+  ExpectShouldRead(Expressions::NotIn("id", {Literal::Long(kIntMinValue - 1),
+                                             Literal::Long(kIntMinValue)}),
+                   false);
+  ExpectShouldRead(Expressions::NotIn("id", {Literal::Long(kIntMaxValue - 4),
+                                             Literal::Long(kIntMaxValue - 3)}),
+                   false);
+  ExpectShouldRead(Expressions::NotIn("id", {Literal::Long(kIntMaxValue),
+                                             Literal::Long(kIntMaxValue + 1)}),
+                   false);
+  ExpectShouldRead(Expressions::NotIn("id", {Literal::Long(kIntMaxValue + 1),
+                                             Literal::Long(kIntMaxValue + 2)}),
+                   true);
+  ExpectShouldRead(Expressions::NotIn("always_5", {Literal::Long(5), Literal::Long(6)}),
+                   false);
+  ExpectShouldRead(
+      Expressions::NotIn("all_nulls", {Literal::String("abc"), Literal::String("def")}),
+      true);
+  ExpectShouldRead(
+      Expressions::NotIn("some_nulls", {Literal::String("abc"), Literal::String("def")}),
+      true, file_with_equal_bounds_);
+  ExpectShouldRead(
+      Expressions::NotIn("no_nulls", {Literal::String("abc"), Literal::String("def")}),
+      false);
+}
+
+TEST_F(StrictMetricsEvaluatorMigratedTest, EvaluateOnNestedColumnWithoutStats) {
+  ExpectShouldRead(Expressions::GreaterThanOrEqual("struct.nested_col_no_stats",
+                                                   Literal::Long(kIntMinValue)),
+                   false);
+  ExpectShouldRead(Expressions::LessThanOrEqual("struct.nested_col_no_stats",
+                                                Literal::Long(kIntMaxValue)),
+                   false);
+  ExpectShouldRead(Expressions::IsNull("struct.nested_col_no_stats"), false);
+  ExpectShouldRead(Expressions::NotNull("struct.nested_col_no_stats"), false);
+}
+
+TEST_F(StrictMetricsEvaluatorMigratedTest, EvaluateOnNestedColumnWithStats) {
+  ExpectShouldRead(Expressions::GreaterThanOrEqual("struct.nested_col_with_stats",
+                                                   Literal::Long(kIntMinValue)),
+                   false);
+  ExpectShouldRead(Expressions::LessThanOrEqual("struct.nested_col_with_stats",
+                                                Literal::Long(kIntMaxValue)),
+                   false);
+  ExpectShouldRead(Expressions::IsNull("struct.nested_col_with_stats"), false);
+  ExpectShouldRead(Expressions::NotNull("struct.nested_col_with_stats"), false);
+}
+
+}  // namespace iceberg

Reply via email to