This is an automated email from the ASF dual-hosted git repository.
russellspitzer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push:
new 157b2488eb API: Optimize NOT IN and != predicates for single-value
partition manifests (#15064)
157b2488eb is described below
commit 157b2488ebfc138370bf482458df96bcdc7245dd
Author: Joy Haldar <[email protected]>
AuthorDate: Fri Jan 23 00:31:33 2026 +0530
API: Optimize NOT IN and != predicates for single-value partition manifests
(#15064)
---
.../iceberg/expressions/ManifestEvaluator.java | 53 +++++++++++
.../TestInclusiveManifestEvaluator.java | 104 ++++++++++++++++++++-
2 files changed, 155 insertions(+), 2 deletions(-)
diff --git
a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
index fc3d394203..21762946ca 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
@@ -269,6 +269,14 @@ public class ManifestEvaluator {
public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
// because the bounds are not necessarily a min or max value, this
cannot be answered using
// them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value
in col.
+ // However, when lower == upper and the manifest has no nulls or NaN
values, we can safely
+ // prune if that value equals the literal.
+ T value = uniqueValue(ref);
+
+ if (value != null && lit.comparator().compare(value, lit.value()) == 0) {
+ return ROWS_CANNOT_MATCH;
+ }
+
return ROWS_MIGHT_MATCH;
}
@@ -313,6 +321,14 @@ public class ManifestEvaluator {
public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) {
// because the bounds are not necessarily a min or max value, this
cannot be answered using
// them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a
value in col.
+ // However, when lower == upper and the manifest has no nulls or NaN
values, we can safely
+ // prune if that value is in the exclusion set.
+ T value = uniqueValue(ref);
+
+ if (value != null && literalSet.contains(value)) {
+ return ROWS_CANNOT_MATCH;
+ }
+
return ROWS_MIGHT_MATCH;
}
@@ -400,6 +416,43 @@ public class ManifestEvaluator {
return ROWS_MIGHT_MATCH;
}
+ /**
+ * Returns the partition field's single value if all partitions contain
the same value. Defined
+ * as a partition field with no nulls, no NaNs (for floating-point types),
and lower bound
+ * equals upper bound. Returns null otherwise.
+ */
+ private <T> T uniqueValue(BoundReference<T> ref) {
+ int pos = Accessors.toPosition(ref.accessor());
+ PartitionFieldSummary fieldStats = stats.get(pos);
+
+ if (fieldStats.containsNull()) {
+ return null;
+ }
+
+ Type.TypeID typeId = ref.type().typeId();
+ if (Type.TypeID.FLOAT.equals(typeId) ||
Type.TypeID.DOUBLE.equals(typeId)) {
+ if (fieldStats.containsNaN() == null || fieldStats.containsNaN()) {
+ return null;
+ }
+ }
+
+ ByteBuffer lowerBound = fieldStats.lowerBound();
+ ByteBuffer upperBound = fieldStats.upperBound();
+
+ if (lowerBound == null || upperBound == null) {
+ return null;
+ }
+
+ T lower = Conversions.fromByteBuffer(ref.type(), lowerBound);
+ T upper = Conversions.fromByteBuffer(ref.type(), upperBound);
+
+ if (ref.comparator().compare(lower, upper) != 0) {
+ return null;
+ }
+
+ return lower;
+ }
+
private boolean allValuesAreNull(PartitionFieldSummary summary,
Type.TypeID typeId) {
// containsNull encodes whether at least one partition value is null,
// lowerBound is null if all partition values are null
diff --git
a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
index 6c4944e9cd..78e6064eb4 100644
---
a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
+++
b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
@@ -66,7 +66,10 @@ public class TestInclusiveManifestEvaluator {
optional(12, "no_nan_or_null", Types.DoubleType.get()),
optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()),
optional(14, "all_same_value_or_null", Types.StringType.get()),
- optional(15, "no_nulls_same_value_a", Types.StringType.get()));
+ optional(15, "no_nulls_same_value_a", Types.StringType.get()),
+ optional(16, "single_value_with_nan", Types.FloatType.get()),
+ optional(17, "single_value_nan_unknown", Types.FloatType.get()),
+ optional(18, "single_value_no_nan", Types.FloatType.get()));
private static final PartitionSpec SPEC =
PartitionSpec.builderFor(SCHEMA)
@@ -84,6 +87,9 @@ public class TestInclusiveManifestEvaluator {
.identity("all_nulls_missing_nan_float")
.identity("all_same_value_or_null")
.identity("no_nulls_same_value_a")
+ .identity("single_value_with_nan")
+ .identity("single_value_nan_unknown")
+ .identity("single_value_no_nan")
.build();
private static final int INT_MIN_VALUE = 30;
@@ -128,7 +134,21 @@ public class TestInclusiveManifestEvaluator {
toByteBuffer(Types.FloatType.get(), 20F)),
new TestHelpers.TestFieldSummary(true, null, null),
new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN),
- new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN)),
+ new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN),
+ new TestHelpers.TestFieldSummary(
+ false,
+ true,
+ toByteBuffer(Types.FloatType.get(), 5.0F),
+ toByteBuffer(Types.FloatType.get(), 5.0F)),
+ new TestHelpers.TestFieldSummary(
+ false,
+ toByteBuffer(Types.FloatType.get(), 5.0F),
+ toByteBuffer(Types.FloatType.get(), 5.0F)),
+ new TestHelpers.TestFieldSummary(
+ false,
+ false,
+ toByteBuffer(Types.FloatType.get(), 5.0F),
+ toByteBuffer(Types.FloatType.get(), 5.0F))),
null);
@Test
@@ -753,4 +773,84 @@ public class TestInclusiveManifestEvaluator {
ManifestEvaluator.forRowFilter(notIn("no_nulls", "abc", "def"), SPEC,
true).eval(FILE);
assertThat(shouldRead).as("Should read: notIn on no nulls
column").isTrue();
}
+
+ @Test
+ public void testNotEqWithSingleValue() {
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(notEqual("no_nulls", "a"), SPEC,
true).eval(FILE);
+ assertThat(shouldRead).as("Should read: manifest has range of
values").isTrue();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notEqual("no_nulls_same_value_a", "a"),
SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead)
+ .as("Should not read: manifest contains single value equal to literal")
+ .isFalse();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notEqual("no_nulls_same_value_a", "b"),
SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: manifest contains single value not equal to literal")
+ .isTrue();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notEqual("all_same_value_or_null",
"a"), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: manifest has nulls which match !=
predicate").isTrue();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notEqual("single_value_with_nan",
5.0F), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: manifest has NaN values which match != predicate")
+ .isTrue();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notEqual("single_value_nan_unknown",
5.0F), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: manifest has unknown NaN
info").isTrue();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notEqual("single_value_no_nan", 5.0F),
SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead)
+ .as("Should not read: manifest contains single float value with no
NaNs")
+ .isFalse();
+ }
+
+ @Test
+ public void testNotInWithSingleValue() {
+ boolean shouldRead =
+ ManifestEvaluator.forRowFilter(notIn("no_nulls", "a", "b"), SPEC,
true).eval(FILE);
+ assertThat(shouldRead).as("Should read: manifest has range of
values").isTrue();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notIn("no_nulls_same_value_a", "a",
"b"), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead)
+ .as("Should not read: manifest contains single value in exclusion
list")
+ .isFalse();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notIn("no_nulls_same_value_a", "b",
"c"), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: manifest contains single value not in exclusion
list")
+ .isTrue();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notIn("all_same_value_or_null", "a",
"b"), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: manifest has nulls which match NOT IN predicate")
+ .isTrue();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notIn("single_value_with_nan", 5.0F,
10.0F), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead)
+ .as("Should read: manifest has NaN values which match NOT IN
predicate")
+ .isTrue();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notIn("single_value_nan_unknown", 5.0F,
10.0F), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead).as("Should read: manifest has unknown NaN
info").isTrue();
+ shouldRead =
+ ManifestEvaluator.forRowFilter(notIn("single_value_no_nan", 5.0F,
10.0F), SPEC, true)
+ .eval(FILE);
+ assertThat(shouldRead)
+ .as("Should not read: manifest contains single float value with no
NaNs")
+ .isFalse();
+ }
}