This is an automated email from the ASF dual-hosted git repository.

laskoviymishka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git


The following commit(s) were added to refs/heads/main by this push:
     new 97929588 fix(table): tighten strict negative metrics checks (#1106)
97929588 is described below

commit 9792958839c4d90551e114382dcff3c7f0f428c9
Author: Minh Vu <[email protected]>
AuthorDate: Thu May 21 10:09:07 2026 +0200

    fix(table): tighten strict negative metrics checks (#1106)
    
    ## Summary
    
    - use `containsNullsOnly` / `containsNansOnly` for strict `NotEqual` and
    `NotIn` metrics evaluation
    - remove now-unused strict `canContain*` helpers
    - add regression coverage for mixed null/value and mixed NaN/value files
    
    ## Why
    
    For strict negative predicates, the evaluator can only prove a whole
    file matches from null/NaN metrics when the column contains only nulls
    or only NaNs. Mixed null/value or NaN/value files still need to fall
    through to bounds checks and remain conservative when bounds do not
    prove the predicate.
    
    Fixes #1103.
    
    ## Testing
    
    - `go test ./table -run TestEvaluators`
    - `go test ./...`
    - `git diff --check`
---
 table/evaluators.go      | 16 ++-------------
 table/evaluators_test.go | 51 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/table/evaluators.go b/table/evaluators.go
index cb9eac68..7d5b0e0b 100644
--- a/table/evaluators.go
+++ b/table/evaluators.go
@@ -1436,7 +1436,7 @@ func (m *strictMetricsEval) VisitNotEqual(t 
iceberg.BoundTerm, lit iceberg.Liter
        field := t.Ref().Field()
        fieldID := field.ID
 
-       if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+       if m.containsNullsOnly(fieldID) || m.containsNansOnly(fieldID) {
                return rowsMustMatch
        }
 
@@ -1518,7 +1518,7 @@ func (m *strictMetricsEval) VisitNotIn(t 
iceberg.BoundTerm, s iceberg.Set[iceber
        field := t.Ref().Field()
        fieldID := field.ID
 
-       if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+       if m.containsNullsOnly(fieldID) || m.containsNansOnly(fieldID) {
                return rowsMustMatch
        }
 
@@ -1562,12 +1562,6 @@ func (m *strictMetricsEval) 
VisitNotStartsWith(iceberg.BoundTerm, iceberg.Litera
        return rowsMightNotMatch
 }
 
-func (m *strictMetricsEval) canContainNulls(fieldID int) bool {
-       cnt, exists := m.nullCounts[fieldID]
-
-       return exists && cnt > 0
-}
-
 func (m *strictMetricsEval) mayContainNulls(field iceberg.NestedField) bool {
        cnt, exists := m.nullCounts[field.ID]
        if !exists {
@@ -1577,12 +1571,6 @@ func (m *strictMetricsEval) mayContainNulls(field 
iceberg.NestedField) bool {
        return cnt > 0
 }
 
-func (m *strictMetricsEval) canContainNans(fieldID int) bool {
-       cnt, exists := m.nanCounts[fieldID]
-
-       return exists && cnt > 0
-}
-
 func (m *strictMetricsEval) mayContainNans(field iceberg.NestedField) bool {
        switch field.Type.(type) {
        case iceberg.Float32Type, iceberg.Float64Type:
diff --git a/table/evaluators_test.go b/table/evaluators_test.go
index 969b956e..b99da0c6 100644
--- a/table/evaluators_test.go
+++ b/table/evaluators_test.go
@@ -2429,6 +2429,55 @@ func (suite *StrictMetricsTestSuite) 
TestMissingNaNCounts() {
        }
 }
 
+func (suite *StrictMetricsTestSuite) 
TestNegativePredicatesWithMixedNullsAndNaNs() {
+       var (
+               IntFive, _ = iceberg.Int32Literal(5).MarshalBinary()
+               FltFive, _ = iceberg.Float32Literal(5).MarshalBinary()
+       )
+
+       schema := iceberg.NewSchema(0,
+               iceberg.NestedField{ID: 1, Name: "mixed_nulls", Type: 
iceberg.PrimitiveTypes.Int32},
+               iceberg.NestedField{ID: 2, Name: "all_nulls", Type: 
iceberg.PrimitiveTypes.Int32},
+               iceberg.NestedField{ID: 3, Name: "mixed_nans", Type: 
iceberg.PrimitiveTypes.Float32, Required: true},
+               iceberg.NestedField{ID: 4, Name: "all_nans", Type: 
iceberg.PrimitiveTypes.Float32, Required: true},
+       )
+       file := &mockDataFile{
+               path:        "file_1.parquet",
+               format:      iceberg.ParquetFile,
+               count:       2,
+               valueCounts: map[int]int64{1: 2, 2: 2, 3: 2, 4: 2},
+               nullCounts:  map[int]int64{1: 1, 2: 2, 3: 0, 4: 0},
+               nanCounts:   map[int]int64{3: 1, 4: 2},
+               lowerBounds: map[int][]byte{1: IntFive, 3: FltFive},
+               upperBounds: map[int][]byte{1: IntFive, 3: FltFive},
+       }
+
+       tests := []struct {
+               expr     iceberg.BooleanExpression
+               expected bool
+               msg      string
+       }{
+               {iceberg.NotEqualTo(iceberg.Reference("mixed_nulls"), 
int32(5)), false, "should skip: mixed nulls do not prove all rows match 
notEqual"},
+               {iceberg.NotIn(iceberg.Reference("mixed_nulls"), int32(5)), 
false, "should skip: mixed nulls do not prove all rows match notIn"},
+               {iceberg.NotEqualTo(iceberg.Reference("all_nulls"), int32(5)), 
true, "should read: all-null column matches notEqual"},
+               {iceberg.NotIn(iceberg.Reference("all_nulls"), int32(5)), true, 
"should read: all-null column matches notIn"},
+               {iceberg.NotEqualTo(iceberg.Reference("mixed_nans"), 
float32(5)), false, "should skip: mixed NaNs do not prove all rows match 
notEqual"},
+               {iceberg.NotIn(iceberg.Reference("mixed_nans"), float32(5)), 
false, "should skip: mixed NaNs do not prove all rows match notIn"},
+               {iceberg.NotEqualTo(iceberg.Reference("all_nans"), float32(5)), 
true, "should read: all-NaN column matches notEqual"},
+               {iceberg.NotIn(iceberg.Reference("all_nans"), float32(5)), 
true, "should read: all-NaN column matches notIn"},
+       }
+
+       for _, tt := range tests {
+               suite.Run(tt.expr.String(), func() {
+                       eval, err := newStrictMetricsEvaluator(schema, tt.expr, 
true, true)
+                       suite.Require().NoError(err)
+                       shouldRead, err := eval(file)
+                       suite.Require().NoError(err)
+                       suite.Equal(tt.expected, shouldRead, tt.msg)
+               })
+       }
+}
+
 func (suite *StrictMetricsTestSuite) TestZeroRecordFileStats() {
        zeroRecordFile := &mockDataFile{
                path:   "file_1.parquet",
@@ -2792,7 +2841,7 @@ func (suite *StrictMetricsTestSuite) TestNotInMetrics() {
                {iceberg.NotIn(ref, IntMaxValue+1, IntMaxValue+2), true, 
"should read: no values == 80 or == 81"},
                {iceberg.NotIn(iceberg.Reference("always_5"), int32(5), 
int32(6)), false, "should skip: all values == 5"},
                {iceberg.NotIn(iceberg.Reference("all_nulls"), "abc", "def"), 
true, "should read: notIn on all nulls column"},
-               {iceberg.NotIn(iceberg.Reference("some_nulls"), "abc", "def"), 
true, "should read: notIn on some nulls column"},
+               {iceberg.NotIn(iceberg.Reference("some_nulls"), "abc", "def"), 
false, "should skip: notIn on some nulls column"},
                {iceberg.NotIn(iceberg.Reference("no_nulls"), "abc", "def"), 
false, "should read: notIn on no nulls column"},
        }
 

Reply via email to