This is an automated email from the ASF dual-hosted git repository. lzljs3620320 pushed a commit to branch release-1.3 in repository https://gitbox.apache.org/repos/asf/paimon.git
commit 71128279c9236c5b599508c0b862929e31dd419c Author: Jingsong Lee <[email protected]> AuthorDate: Mon Oct 20 21:08:02 2025 +0800 [python] Drop stats for manifest entries reading (#6429) --- .../pypaimon/manifest/manifest_file_manager.py | 4 +++- .../pypaimon/manifest/schema/data_file_meta.py | 26 ++++++++++++++++++++++ .../pypaimon/manifest/schema/manifest_entry.py | 10 +++++++++ .../pypaimon/manifest/schema/simple_stats.py | 11 +++++++++ paimon-python/pypaimon/tests/predicates_test.py | 12 +++++----- .../pypaimon/tests/py36/rest_ao_read_write_test.py | 6 +++-- paimon-python/pypaimon/tests/reader_base_test.py | 14 +++++++++--- 7 files changed, 71 insertions(+), 12 deletions(-) diff --git a/paimon-python/pypaimon/manifest/manifest_file_manager.py b/paimon-python/pypaimon/manifest/manifest_file_manager.py index 07e434dd37..bb9251df7e 100644 --- a/paimon-python/pypaimon/manifest/manifest_file_manager.py +++ b/paimon-python/pypaimon/manifest/manifest_file_manager.py @@ -41,7 +41,7 @@ class ManifestFileManager: self.primary_key_fields = self.table.table_schema.get_primary_key_fields() self.trimmed_primary_key_fields = self.table.table_schema.get_trimmed_primary_key_fields() - def read(self, manifest_file_name: str, manifest_entry_filter=None) -> List[ManifestEntry]: + def read(self, manifest_file_name: str, manifest_entry_filter=None, drop_stats=True) -> List[ManifestEntry]: manifest_file_path = self.manifest_path / manifest_file_name entries = [] @@ -107,6 +107,8 @@ class ManifestFileManager: ) if manifest_entry_filter is not None and not manifest_entry_filter(entry): continue + if drop_stats: + entry = entry.copy_without_stats() entries.append(entry) return entries diff --git a/paimon-python/pypaimon/manifest/schema/data_file_meta.py b/paimon-python/pypaimon/manifest/schema/data_file_meta.py index 1d1bcb56fb..8206061c84 100644 --- a/paimon-python/pypaimon/manifest/schema/data_file_meta.py +++ b/paimon-python/pypaimon/manifest/schema/data_file_meta.py @@ -61,6 +61,32 @@ class DataFileMeta: path_builder = path_builder / ("bucket-" + str(bucket)) / self.file_name self.file_path = str(path_builder) + def copy_without_stats(self) -> 'DataFileMeta': + """Create a new DataFileMeta without value statistics.""" + return DataFileMeta( + file_name=self.file_name, + file_size=self.file_size, + row_count=self.row_count, + min_key=self.min_key, + max_key=self.max_key, + key_stats=self.key_stats, + value_stats=SimpleStats.empty_stats(), + min_sequence_number=self.min_sequence_number, + max_sequence_number=self.max_sequence_number, + schema_id=self.schema_id, + level=self.level, + extra_files=self.extra_files, + creation_time=self.creation_time, + delete_row_count=self.delete_row_count, + embedded_index=self.embedded_index, + file_source=self.file_source, + value_stats_cols=[], + external_path=self.external_path, + first_row_id=self.first_row_id, + write_cols=self.write_cols, + file_path=self.file_path + ) + def assign_first_row_id(self, first_row_id: int) -> 'DataFileMeta': """Create a new DataFileMeta with the assigned first_row_id.""" return DataFileMeta( diff --git a/paimon-python/pypaimon/manifest/schema/manifest_entry.py b/paimon-python/pypaimon/manifest/schema/manifest_entry.py index 9608fbbd37..b1fd244daf 100644 --- a/paimon-python/pypaimon/manifest/schema/manifest_entry.py +++ b/paimon-python/pypaimon/manifest/schema/manifest_entry.py @@ -31,6 +31,16 @@ class ManifestEntry: total_buckets: int file: DataFileMeta + def copy_without_stats(self) -> 'ManifestEntry': + """Create a new ManifestEntry without value statistics.""" + return ManifestEntry( + kind=self.kind, + partition=self.partition, + bucket=self.bucket, + total_buckets=self.total_buckets, + file=self.file.copy_without_stats() + ) + def assign_first_row_id(self, first_row_id: int) -> 'ManifestEntry': """Create a new ManifestEntry with the assigned first_row_id.""" return ManifestEntry( diff --git a/paimon-python/pypaimon/manifest/schema/simple_stats.py b/paimon-python/pypaimon/manifest/schema/simple_stats.py index 45982491b9..19816fdd0f 100644 --- a/paimon-python/pypaimon/manifest/schema/simple_stats.py +++ b/paimon-python/pypaimon/manifest/schema/simple_stats.py @@ -18,6 +18,7 @@ from dataclasses import dataclass from typing import List, Optional +from typing import ClassVar from pypaimon.table.row.generic_row import GenericRow @@ -28,6 +29,16 @@ class SimpleStats: max_values: GenericRow null_counts: Optional[List[int]] + _empty_stats: ClassVar[object] = None + + @classmethod + def empty_stats(cls): + if cls._empty_stats is None: + min_values = GenericRow([], []) + max_values = GenericRow([], []) + cls._empty_stats = cls(min_values, max_values, None) + return cls._empty_stats + SIMPLE_STATS_SCHEMA = { "type": "record", diff --git a/paimon-python/pypaimon/tests/predicates_test.py b/paimon-python/pypaimon/tests/predicates_test.py index a3a0e3229c..6158d1d88b 100644 --- a/paimon-python/pypaimon/tests/predicates_test.py +++ b/paimon-python/pypaimon/tests/predicates_test.py @@ -454,20 +454,20 @@ class PredicateTest(unittest.TestCase): if split.partition.values == ["p1", 2]: count += 1 self.assertEqual(len(split.files), 1) - min_values = split.files[0].value_stats.min_values.to_dict() - max_values = split.files[0].value_stats.max_values.to_dict() + min_values = split.files[0].key_stats.min_values.to_dict() + max_values = split.files[0].key_stats.max_values.to_dict() self.assertTrue(min_values["key1"] == 1 and min_values["key2"] == "e" and max_values["key1"] == 4 and max_values["key2"] == "h") elif split.partition.values == ["p2", 2]: count += 1 - min_values = split.files[0].value_stats.min_values.to_dict() - max_values = split.files[0].value_stats.max_values.to_dict() + min_values = split.files[0].key_stats.min_values.to_dict() + max_values = split.files[0].key_stats.max_values.to_dict() self.assertTrue(min_values["key1"] == 5 and min_values["key2"] == "a" and max_values["key1"] == 8 and max_values["key2"] == "d") elif split.partition.values == ["p1", 1]: count += 1 - min_values = split.files[0].value_stats.min_values.to_dict() - max_values = split.files[0].value_stats.max_values.to_dict() + min_values = split.files[0].key_stats.min_values.to_dict() + max_values = split.files[0].key_stats.max_values.to_dict() self.assertTrue(min_values["key1"] == max_values["key1"] == 7 and max_values["key2"] == max_values["key2"] == "b") self.assertEqual(count, 3) diff --git a/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py b/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py index a184f64866..9be66d9759 100644 --- a/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py +++ b/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py @@ -180,7 +180,9 @@ class RESTAOReadWritePy36Test(RESTBaseTest): latest_snapshot = SnapshotManager(table).get_latest_snapshot() manifest_files = table_scan.starting_scanner.manifest_list_manager.read_all(latest_snapshot) manifest_entries = table_scan.starting_scanner.manifest_file_manager.read( - manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row)) + manifest_files[0].file_name, + lambda row: table_scan.starting_scanner._filter_manifest_entry(row), + drop_stats=False) min_value_stats = manifest_entries[0].file.value_stats.min_values.values max_value_stats = manifest_entries[0].file.value_stats.max_values.values expected_min_values = [col[0].as_py() for col in expect_data] @@ -849,7 +851,7 @@ class RESTAOReadWritePy36Test(RESTBaseTest): manifest_manager.write(manifest_file_name, [entry]) # Read the manifest entry back - entries = manifest_manager.read(manifest_file_name) + entries = manifest_manager.read(manifest_file_name, drop_stats=False) # Verify we have exactly one entry self.assertEqual(len(entries), 1) diff --git a/paimon-python/pypaimon/tests/reader_base_test.py b/paimon-python/pypaimon/tests/reader_base_test.py index d158c824ef..a06e120e95 100644 --- a/paimon-python/pypaimon/tests/reader_base_test.py +++ b/paimon-python/pypaimon/tests/reader_base_test.py @@ -210,14 +210,22 @@ class ReaderBasicTest(unittest.TestCase): read_builder = table.new_read_builder() table_scan = read_builder.new_scan() table_read = read_builder.new_read() - actual_data = table_read.to_arrow(table_scan.plan().splits()) + splits = table_scan.plan().splits() + + # assert data file without stats + first_file = splits[0].files[0] + self.assertEqual(first_file.value_stats_cols, []) + self.assertEqual(first_file.value_stats, SimpleStats.empty_stats()) + + # assert equal + actual_data = table_read.to_arrow(splits) self.assertEqual(actual_data, expect_data) # to test GenericRow ability latest_snapshot = SnapshotManager(table).get_latest_snapshot() manifest_files = table_scan.starting_scanner.manifest_list_manager.read_all(latest_snapshot) manifest_entries = table_scan.starting_scanner.manifest_file_manager.read( - manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row)) + manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row), False) min_value_stats = manifest_entries[0].file.value_stats.min_values.values max_value_stats = manifest_entries[0].file.value_stats.max_values.values expected_min_values = [col[0].as_py() for col in expect_data] @@ -627,7 +635,7 @@ class ReaderBasicTest(unittest.TestCase): manifest_manager.write(manifest_file_name, [entry]) # Read the manifest entry back - entries = manifest_manager.read(manifest_file_name) + entries = manifest_manager.read(manifest_file_name, drop_stats=False) # Verify we have exactly one entry self.assertEqual(len(entries), 1)
