This is an automated email from the ASF dual-hosted git repository.
liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 4f5274a8a feat(datafusion): support isnan predicate pushdown to
Iceberg (#2142)
4f5274a8a is described below
commit 4f5274a8afd835e746167e48683207b71b20115a
Author: Kaiqi Dong <[email protected]>
AuthorDate: Thu Feb 26 02:38:34 2026 +0100
feat(datafusion): support isnan predicate pushdown to Iceberg (#2142)
## Which issue does this PR close?
Part of the ongoing effort to improve predicate pushdown coverage in the
DataFusion integration.
- Closes #2143
## What changes are included in this PR?
This PR adds support for pushing down `isnan()` predicates from
DataFusion to Iceberg's native `IsNan` / `NotNan` predicate operators.
In DataFusion, `isnan()` is represented as a scalar function
(`Expr::ScalarFunction`) rather than a dedicated `Expr` variant (unlike
`IsNull` / `IsNotNull`). This PR introduces a new
`scalar_function_to_iceberg_predicate` helper in `expr_to_predicate.rs`
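that matches scalar functions by name at runtime and converts
`isnan(col)` into `Predicate::Unary(IsNan, col)`. Only an argument that
resolves to a plain column reference is supported for now; any other
argument (e.g. `isnan(col + 1)`) is left untransformed and is not pushed
down.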
Negation (`NOT isnan(col)`) is handled automatically: the existing
`Expr::Not` arm wraps the result in `Predicate::Not(...)`, and Iceberg's
downstream `rewrite_not` visitor normalizes it into
`Predicate::Unary(NotNan, col)`.
This enables file pruning using `nan_value_counts` in manifest metadata
for float/double columns, as defined in the [Iceberg spec — Manifest
Files: field summaries and column
statistics](https://iceberg.apache.org/spec/#manifests).
## Are these changes tested?
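Yes. Unit tests were added in `expr_to_predicate.rs` covering
`isnan(col)`, `NOT isnan(col)`, `isnan(col)` combined with another
condition via `AND`, and the unsupported `isnan(col + 1)` case, which
must not be pushed down.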
---
.../src/physical_plan/expr_to_predicate.rs | 58 ++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git
a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
index 9f37345f8..17c9416d5 100644
--- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
+++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
@@ -18,6 +18,7 @@
use std::vec;
use datafusion::arrow::datatypes::DataType;
+use datafusion::logical_expr::expr::ScalarFunction;
use datafusion::logical_expr::{Expr, Like, Operator};
use datafusion::scalar::ScalarValue;
use iceberg::expr::{BinaryExpression, Predicate, PredicateOperator, Reference,
UnaryExpression};
@@ -196,6 +197,9 @@ fn to_iceberg_predicate(expr: &Expr) -> TransformedResult {
TransformedResult::NotTransformed
}
}
+ Expr::ScalarFunction(ScalarFunction { func, args }) => {
+ scalar_function_to_iceberg_predicate(func.name(), args)
+ }
_ => TransformedResult::NotTransformed,
}
}
@@ -216,6 +220,25 @@ fn to_iceberg_operation(op: Operator) ->
OpTransformedResult {
}
}
+/// Translates a DataFusion scalar function into an Iceberg predicate.
+/// Unlike dedicated Expr variants (e.g. `Expr::IsNull`), scalar functions are
+/// identified by name at runtime, so we need to handle them here.
+fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) ->
TransformedResult {
+ match func_name {
+ // TODO: support complex expression arguments to scalar functions
+ "isnan" if args.len() == 1 => {
+ let operand = to_iceberg_predicate(&args[0]);
+ match operand {
+ TransformedResult::Column(r) =>
TransformedResult::Predicate(Predicate::Unary(
+ UnaryExpression::new(PredicateOperator::IsNan, r),
+ )),
+ _ => TransformedResult::NotTransformed,
+ }
+ }
+ _ => TransformedResult::NotTransformed,
+ }
+}
+
fn to_iceberg_and_predicate(
left: TransformedResult,
right: TransformedResult,
@@ -324,6 +347,10 @@ mod tests {
Field::new("ts", DataType::Timestamp(TimeUnit::Second, None),
true).with_metadata(
HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(),
"3".to_string())]),
),
+ Field::new("qux", DataType::Float64,
true).with_metadata(HashMap::from([(
+ PARQUET_FIELD_ID_META_KEY.to_string(),
+ "4".to_string(),
+ )])),
]);
DFSchema::try_from_qualified_schema("my_table", &arrow_schema).unwrap()
}
@@ -681,4 +708,35 @@ mod tests {
Reference::new("bar").starts_with(Datum::string("测试"))
);
}
+
+ #[test]
+ fn test_predicate_conversion_with_isnan() {
+ let predicate = convert_to_iceberg_predicate("isnan(qux)").unwrap();
+ assert_eq!(predicate, Reference::new("qux").is_nan());
+ }
+
+ #[test]
+ fn test_predicate_conversion_with_not_isnan() {
+ let predicate = convert_to_iceberg_predicate("NOT
isnan(qux)").unwrap();
+ assert_eq!(predicate, !Reference::new("qux").is_nan());
+ }
+
+ #[test]
+ fn test_predicate_conversion_with_isnan_and_other_condition() {
+ let sql = "isnan(qux) AND foo > 1";
+ let predicate = convert_to_iceberg_predicate(sql).unwrap();
+ let expected_predicate = Predicate::and(
+ Reference::new("qux").is_nan(),
+ Reference::new("foo").greater_than(Datum::long(1)),
+ );
+ assert_eq!(predicate, expected_predicate);
+ }
+
+ #[test]
+ fn test_predicate_conversion_with_isnan_unsupported_arg() {
+ // isnan on a complex expression (not a bare column) cannot be pushed
down
+ let sql = "isnan(qux + 1)";
+ let predicate = convert_to_iceberg_predicate(sql);
+ assert_eq!(predicate, None);
+ }
}