[GitHub] [iceberg] Fokko commented on a diff in pull request #6714: Python: Filter on Datafile metrics

via GitHub Wed, 01 Feb 2023 17:13:38 -0800


Fokko commented on code in PR #6714:
URL: https://github.com/apache/iceberg/pull/6714#discussion_r1093907811



##########
python/tests/expressions/test_visitors.py:
##########
@@ -1472,3 +1478,211 @@ def test_dnf_to_dask(table_schema_simple: Schema) -> 
None:
         ),
     )
     assert expression_to_plain_format(expr) == [[("foo", ">", "hello")], 
[("bar", "in", {1, 2, 3}), ("baz", "==", True)]]
+
+
+@pytest.fixture
+def schema_data_file() -> Schema:
+    return Schema(
+        NestedField(1, "all_nan", DoubleType(), required=True),
+        NestedField(2, "max_nan", DoubleType(), required=True),
+        NestedField(3, "min_max_nan", FloatType(), required=False),
+        NestedField(4, "all_nan_null_bounds", DoubleType(), required=True),
+        NestedField(5, "some_nan_correct_bounds", FloatType(), required=False),
+    )
+
+
+@pytest.fixture
+def data_file() -> DataFile:
+    return DataFile(
+        file_path="file.avro",
+        file_format=FileFormat.PARQUET,
+        partition={},
+        record_count=50,
+        file_size_in_bytes=3,
+        column_sizes={
+            1: 10,
+            2: 10,
+            3: 10,
+            4: 10,
+            5: 10,
+        },
+        value_counts={
+            1: 10,
+            2: 10,
+            3: 10,
+            4: 10,
+            5: 10,
+        },
+        null_value_counts={
+            1: 0,
+            2: 0,
+            3: 0,
+            4: 0,
+            5: 0,
+        },
+        nan_value_counts={1: 10, 4: 10, 5: 5},
+        lower_bounds={
+            1: to_bytes(DoubleType(), float("nan")),
+            2: to_bytes(DoubleType(), 7),
+            3: to_bytes(FloatType(), float("nan")),
+            5: to_bytes(FloatType(), 7),
+        },
+        upper_bounds={
+            1: to_bytes(DoubleType(), float("nan")),
+            2: to_bytes(DoubleType(), float("nan")),
+            3: to_bytes(FloatType(), float("nan")),
+            5: to_bytes(FloatType(), 22),
+        },
+    )
+
+
+def 
test_inclusive_metrics_evaluator_less_than_and_less_than_equal(schema_data_file:
 Schema, data_file: DataFile) -> None:
+    for operator in [LessThan, LessThanOrEqual]:
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("all_nan", 1)).eval(data_file)
+        assert not should_read, "Should not match: all nan column doesn't 
contain number"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("max_nan", 1)).eval(data_file)
+        assert not should_read, "Should not match: 1 is smaller than lower 
bound"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("max_nan", 10)).eval(data_file)
+        assert should_read, "Should match: 10 is larger than lower bound"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("min_max_nan", 1)).eval(data_file)
+        assert should_read, "Should match: no visibility"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("all_nan_null_bounds", 1)).eval(data_file)
+        assert not should_read, "Should not match: all nan column doesn't 
contain number"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("some_nan_correct_bounds", 1)).eval(data_file)
+        assert not should_read, "Should not match: 1 is smaller than lower 
bound"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("some_nan_correct_bounds", 10)).eval(data_file)
+        assert should_read, "Should match: 10 larger than lower bound"
+
+
+def 
test_inclusive_metrics_evaluator_greater_than_and_greater_than_equal(schema_data_file:
 Schema, data_file: DataFile) -> None:
+    for operator in [GreaterThan, GreaterThanOrEqual]:
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("all_nan", 1)).eval(data_file)
+        assert not should_read, "Should not match: all nan column doesn't 
contain number"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("max_nan", 1)).eval(data_file)
+        assert should_read, "Should match: upper bound is larger than 1"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("max_nan", 10)).eval(data_file)
+        assert should_read, "Should match: upper bound is larger than 10"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("min_max_nan", 1)).eval(data_file)
+        assert should_read, "Should match: no visibility"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("all_nan_null_bounds", 1)).eval(data_file)
+        assert not should_read, "Should not match: all nan column doesn't 
contain number"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("some_nan_correct_bounds", 1)).eval(data_file)
+        assert should_read, "Should match: 1 is smaller than upper bound"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("some_nan_correct_bounds", 10)).eval(data_file)
+        assert should_read, "Should match: 10 is smaller than upper bound"
+
+        should_read = _InclusiveMetricsEvaluator(schema_data_file, 
operator("all_nan", 30)).eval(data_file)
+        assert not should_read, "Should not match: 30 is greater than upper 
bound"
+
+
+def test_inclusive_metrics_evaluator_equals(schema_data_file: Schema, 
data_file: DataFile) -> None:
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
EqualTo("all_nan", 1)).eval(data_file)
+    assert not should_read, "Should not match: all nan column doesn't contain 
number"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
EqualTo("max_nan", 1)).eval(data_file)
+    assert not should_read, "Should not match: 1 is smaller than lower bound"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
EqualTo("max_nan", 10)).eval(data_file)
+    assert should_read, "Should match: 10 is within bounds"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
EqualTo("min_max_nan", 1)).eval(data_file)
+    assert should_read, "Should match: no visibility"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
EqualTo("all_nan_null_bounds", 1)).eval(data_file)
+    assert not should_read, "Should not match: all nan column doesn't contain 
number"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
EqualTo("some_nan_correct_bounds", 1)).eval(data_file)
+    assert not should_read, "Should not match: 1 is smaller than lower bound"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
EqualTo("some_nan_correct_bounds", 10)).eval(data_file)
+    assert should_read, "Should match: 10 is within bounds"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
EqualTo("all_nan", 30)).eval(data_file)
+    assert not should_read, "Should not match: 30 is greater than upper bound"
+
+
+def test_inclusive_metrics_evaluator_not_equals(schema_data_file: Schema, 
data_file: DataFile) -> None:
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
NotEqualTo("all_nan", 1)).eval(data_file)
+    assert should_read, "Should match: no visibility"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
NotEqualTo("max_nan", 10)).eval(data_file)
+    assert should_read, "Should match: no visibility"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
NotEqualTo("max_nan", 10)).eval(data_file)
+    assert should_read, "Should match: no visibility"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
NotEqualTo("min_max_nan", 1)).eval(data_file)
+    assert should_read, "Should match: no visibility"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
NotEqualTo("all_nan_null_bounds", 1)).eval(data_file)
+    assert should_read, "Should match: no visibility"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
NotEqualTo("some_nan_correct_bounds", 1)).eval(data_file)
+    assert should_read, "Should match: no visibility"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
NotEqualTo("some_nan_correct_bounds", 10)).eval(data_file)
+    assert should_read, "Should match: no visibility"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
NotEqualTo("some_nan_correct_bounds", 30)).eval(data_file)
+    assert should_read, "Should match: no visibility"
+
+
+def test_inclusive_metrics_evaluator_in(schema_data_file: Schema, data_file: 
DataFile) -> None:
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, In("all_nan", 
(1, 10, 30))).eval(data_file)
+    assert not should_read, "Should not match: all nan column doesn't contain 
number"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, In("max_nan", 
(1, 10, 30))).eval(data_file)
+    assert should_read, "Should match: 10 and 30 are greater than lower bound"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
In("min_max_nan", (1, 10, 30))).eval(data_file)
+    assert should_read, "Should match: no visibility"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
In("all_nan_null_bounds", (1, 10, 30))).eval(data_file)
+    assert not should_read, "Should not match: all nan column doesn't contain 
number"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
In("some_nan_correct_bounds", (1, 10, 30))).eval(data_file)
+    assert should_read, "Should match: 10 within bounds"
+
+    should_read = _InclusiveMetricsEvaluator(schema_data_file, 
In("some_nan_correct_bounds", (1, 30))).eval(data_file)
+    assert not should_read, "Should not match: 1 not within bounds"

Review Comment:
   Good call, just copied this from Java :)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] Fokko commented on a diff in pull request #6714: Python: Filter on Datafile metrics

Reply via email to