This is an automated email from the ASF dual-hosted git repository.
kevinjqliu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new ab43c6ca fix `KeyError` raised by `add_files` when parquet file does
not have column stats (#1354)
ab43c6ca is described below
commit ab43c6ca5d844a90b30473fb1aa02b3ff34870ed
Author: Binayak Dasgupta <[email protected]>
AuthorDate: Tue Nov 26 00:58:37 2024 +0800
fix `KeyError` raised by `add_files` when parquet file does not have column
stats (#1354)
* fix KeyError, by switching del to pop
* added unit test
* update test
* fix python 3.9 compatibility, and refactor test
* update test
---
pyiceberg/io/pyarrow.py | 4 ++--
tests/io/test_pyarrow_stats.py | 43 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 43 insertions(+), 4 deletions(-)
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index d2c4a601..bd4e969d 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -2397,8 +2397,8 @@ def data_file_statistics_from_parquet_metadata(
split_offsets.sort()
for field_id in invalidate_col:
- del col_aggs[field_id]
- del null_value_counts[field_id]
+ col_aggs.pop(field_id, None)
+ null_value_counts.pop(field_id, None)
return DataFileStatistics(
record_count=parquet_metadata.num_rows,
diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
index 41f1432d..78889171 100644
--- a/tests/io/test_pyarrow_stats.py
+++ b/tests/io/test_pyarrow_stats.py
@@ -81,7 +81,9 @@ class TestStruct:
y: Optional[float]
-def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+def construct_test_table(
+ write_statistics: Union[bool, List[str]] = True,
+) -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
table_metadata = {
"format-version": 2,
"location": "s3://bucket/test/location",
@@ -169,7 +171,9 @@ def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, Tabl
metadata_collector: List[Any] = []
with pa.BufferOutputStream() as f:
- with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer:
+ with pq.ParquetWriter(
+ f, table.schema, metadata_collector=metadata_collector, write_statistics=write_statistics
+ ) as writer:
writer.write_table(table)
return metadata_collector[0], table_metadata
@@ -681,6 +685,41 @@ def test_stats_types(table_schema_nested: Schema) -> None:
]
+def test_read_missing_statistics() -> None:
+ # write statistics only for the "strings" column
+ metadata, table_metadata = construct_test_table(write_statistics=["strings"])
+
+ # expect only "strings" column to have statistics in metadata
+ # and all other columns to have no statistics
+ for r in range(metadata.num_row_groups):
+ for pos in range(metadata.num_columns):
+ if metadata.row_group(r).column(pos).path_in_schema == "strings":
+ assert metadata.row_group(r).column(pos).is_stats_set is True
+ assert metadata.row_group(r).column(pos).statistics is not None
+ else:
+ assert metadata.row_group(r).column(pos).is_stats_set is False
+ assert metadata.row_group(r).column(pos).statistics is None
+
+ schema = get_current_schema(table_metadata)
+ statistics = data_file_statistics_from_parquet_metadata(
+ parquet_metadata=metadata,
+ stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+ parquet_column_mapping=parquet_path_to_id_mapping(schema),
+ )
+
+ datafile = DataFile(**statistics.to_serialized_dict())
+
+ # expect only "strings" column values to be reflected in the
+ # upper_bound, lower_bound and null_value_counts props of datafile
+ string_col_idx = 1
+ assert len(datafile.lower_bounds) == 1
+ assert datafile.lower_bounds[string_col_idx].decode() == "aaaaaaaaaaaaaaaa"
+ assert len(datafile.upper_bounds) == 1
+ assert datafile.upper_bounds[string_col_idx].decode() == "zzzzzzzzzzzzzzz{"
+ assert len(datafile.null_value_counts) == 1
+ assert datafile.null_value_counts[string_col_idx] == 1
+
+
# This is commented out for now because write_to_dataset drops the partition
# columns making it harder to calculate the mapping from the column index to
# datatype id