This is an automated email from the ASF dual-hosted git repository.
laskoviymishka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git
The following commit(s) were added to refs/heads/main by this push:
new 97929588 fix(table): tighten strict negative metrics checks (#1106)
97929588 is described below
commit 9792958839c4d90551e114382dcff3c7f0f428c9
Author: Minh Vu <[email protected]>
AuthorDate: Thu May 21 10:09:07 2026 +0200
fix(table): tighten strict negative metrics checks (#1106)
## Summary
- use `containsNullsOnly` / `containsNansOnly` for strict `NotEqual` and
`NotIn` metrics evaluation
- remove now-unused strict `canContain*` helpers
- add regression coverage for mixed null/value and mixed NaN/value files
## Why
For strict negative predicates, the evaluator can only prove a whole
file matches from null/NaN metrics when the column contains only nulls
or only NaNs. Mixed null/value or NaN/value files still need to fall
through to bounds checks and remain conservative when bounds do not
prove the predicate.
Fixes #1103.
## Testing
- `go test ./table -run TestEvaluators`
- `go test ./...`
- `git diff --check`
---
table/evaluators.go | 16 ++-------------
table/evaluators_test.go | 51 +++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 52 insertions(+), 15 deletions(-)
diff --git a/table/evaluators.go b/table/evaluators.go
index cb9eac68..7d5b0e0b 100644
--- a/table/evaluators.go
+++ b/table/evaluators.go
@@ -1436,7 +1436,7 @@ func (m *strictMetricsEval) VisitNotEqual(t
iceberg.BoundTerm, lit iceberg.Liter
field := t.Ref().Field()
fieldID := field.ID
- if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+ if m.containsNullsOnly(fieldID) || m.containsNansOnly(fieldID) {
return rowsMustMatch
}
@@ -1518,7 +1518,7 @@ func (m *strictMetricsEval) VisitNotIn(t
iceberg.BoundTerm, s iceberg.Set[iceber
field := t.Ref().Field()
fieldID := field.ID
- if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+ if m.containsNullsOnly(fieldID) || m.containsNansOnly(fieldID) {
return rowsMustMatch
}
@@ -1562,12 +1562,6 @@ func (m *strictMetricsEval)
VisitNotStartsWith(iceberg.BoundTerm, iceberg.Litera
return rowsMightNotMatch
}
-func (m *strictMetricsEval) canContainNulls(fieldID int) bool {
- cnt, exists := m.nullCounts[fieldID]
-
- return exists && cnt > 0
-}
-
func (m *strictMetricsEval) mayContainNulls(field iceberg.NestedField) bool {
cnt, exists := m.nullCounts[field.ID]
if !exists {
@@ -1577,12 +1571,6 @@ func (m *strictMetricsEval) mayContainNulls(field
iceberg.NestedField) bool {
return cnt > 0
}
-func (m *strictMetricsEval) canContainNans(fieldID int) bool {
- cnt, exists := m.nanCounts[fieldID]
-
- return exists && cnt > 0
-}
-
func (m *strictMetricsEval) mayContainNans(field iceberg.NestedField) bool {
switch field.Type.(type) {
case iceberg.Float32Type, iceberg.Float64Type:
diff --git a/table/evaluators_test.go b/table/evaluators_test.go
index 969b956e..b99da0c6 100644
--- a/table/evaluators_test.go
+++ b/table/evaluators_test.go
@@ -2429,6 +2429,55 @@ func (suite *StrictMetricsTestSuite)
TestMissingNaNCounts() {
}
}
+func (suite *StrictMetricsTestSuite)
TestNegativePredicatesWithMixedNullsAndNaNs() {
+ var (
+ IntFive, _ = iceberg.Int32Literal(5).MarshalBinary()
+ FltFive, _ = iceberg.Float32Literal(5).MarshalBinary()
+ )
+
+ schema := iceberg.NewSchema(0,
+ iceberg.NestedField{ID: 1, Name: "mixed_nulls", Type:
iceberg.PrimitiveTypes.Int32},
+ iceberg.NestedField{ID: 2, Name: "all_nulls", Type:
iceberg.PrimitiveTypes.Int32},
+ iceberg.NestedField{ID: 3, Name: "mixed_nans", Type:
iceberg.PrimitiveTypes.Float32, Required: true},
+ iceberg.NestedField{ID: 4, Name: "all_nans", Type:
iceberg.PrimitiveTypes.Float32, Required: true},
+ )
+ file := &mockDataFile{
+ path: "file_1.parquet",
+ format: iceberg.ParquetFile,
+ count: 2,
+ valueCounts: map[int]int64{1: 2, 2: 2, 3: 2, 4: 2},
+ nullCounts: map[int]int64{1: 1, 2: 2, 3: 0, 4: 0},
+ nanCounts: map[int]int64{3: 1, 4: 2},
+ lowerBounds: map[int][]byte{1: IntFive, 3: FltFive},
+ upperBounds: map[int][]byte{1: IntFive, 3: FltFive},
+ }
+
+ tests := []struct {
+ expr iceberg.BooleanExpression
+ expected bool
+ msg string
+ }{
+ {iceberg.NotEqualTo(iceberg.Reference("mixed_nulls"),
int32(5)), false, "should skip: mixed nulls do not prove all rows match
notEqual"},
+ {iceberg.NotIn(iceberg.Reference("mixed_nulls"), int32(5)),
false, "should skip: mixed nulls do not prove all rows match notIn"},
+ {iceberg.NotEqualTo(iceberg.Reference("all_nulls"), int32(5)),
true, "should read: all-null column matches notEqual"},
+ {iceberg.NotIn(iceberg.Reference("all_nulls"), int32(5)), true,
"should read: all-null column matches notIn"},
+ {iceberg.NotEqualTo(iceberg.Reference("mixed_nans"),
float32(5)), false, "should skip: mixed NaNs do not prove all rows match
notEqual"},
+ {iceberg.NotIn(iceberg.Reference("mixed_nans"), float32(5)),
false, "should skip: mixed NaNs do not prove all rows match notIn"},
+ {iceberg.NotEqualTo(iceberg.Reference("all_nans"), float32(5)),
true, "should read: all-NaN column matches notEqual"},
+ {iceberg.NotIn(iceberg.Reference("all_nans"), float32(5)),
true, "should read: all-NaN column matches notIn"},
+ }
+
+ for _, tt := range tests {
+ suite.Run(tt.expr.String(), func() {
+ eval, err := newStrictMetricsEvaluator(schema, tt.expr,
true, true)
+ suite.Require().NoError(err)
+ shouldRead, err := eval(file)
+ suite.Require().NoError(err)
+ suite.Equal(tt.expected, shouldRead, tt.msg)
+ })
+ }
+}
+
func (suite *StrictMetricsTestSuite) TestZeroRecordFileStats() {
zeroRecordFile := &mockDataFile{
path: "file_1.parquet",
@@ -2792,7 +2841,7 @@ func (suite *StrictMetricsTestSuite) TestNotInMetrics() {
{iceberg.NotIn(ref, IntMaxValue+1, IntMaxValue+2), true,
"should read: no values == 80 or == 81"},
{iceberg.NotIn(iceberg.Reference("always_5"), int32(5),
int32(6)), false, "should skip: all values == 5"},
{iceberg.NotIn(iceberg.Reference("all_nulls"), "abc", "def"),
true, "should read: notIn on all nulls column"},
- {iceberg.NotIn(iceberg.Reference("some_nulls"), "abc", "def"),
true, "should read: notIn on some nulls column"},
+ {iceberg.NotIn(iceberg.Reference("some_nulls"), "abc", "def"),
false, "should skip: notIn on some nulls column"},
{iceberg.NotIn(iceberg.Reference("no_nulls"), "abc", "def"),
false, "should read: notIn on no nulls column"},
}