This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 3d3b09734e [python] Drop stats for manifest entries reading (#6429)
3d3b09734e is described below
commit 3d3b09734e5c495c7906bb6775270be504d85ecc
Author: Jingsong Lee <[email protected]>
AuthorDate: Mon Oct 20 21:08:02 2025 +0800
[python] Drop stats for manifest entries reading (#6429)
---
.../pypaimon/manifest/manifest_file_manager.py | 4 +++-
.../pypaimon/manifest/schema/data_file_meta.py | 26 ++++++++++++++++++++++
.../pypaimon/manifest/schema/manifest_entry.py | 10 +++++++++
.../pypaimon/manifest/schema/simple_stats.py | 11 +++++++++
paimon-python/pypaimon/tests/predicates_test.py | 12 +++++-----
.../pypaimon/tests/py36/rest_ao_read_write_test.py | 6 +++--
paimon-python/pypaimon/tests/reader_base_test.py | 14 +++++++++---
7 files changed, 71 insertions(+), 12 deletions(-)
diff --git a/paimon-python/pypaimon/manifest/manifest_file_manager.py b/paimon-python/pypaimon/manifest/manifest_file_manager.py
index 07e434dd37..bb9251df7e 100644
--- a/paimon-python/pypaimon/manifest/manifest_file_manager.py
+++ b/paimon-python/pypaimon/manifest/manifest_file_manager.py
@@ -41,7 +41,7 @@ class ManifestFileManager:
self.primary_key_fields = self.table.table_schema.get_primary_key_fields()
self.trimmed_primary_key_fields = self.table.table_schema.get_trimmed_primary_key_fields()
- def read(self, manifest_file_name: str, manifest_entry_filter=None) -> List[ManifestEntry]:
+ def read(self, manifest_file_name: str, manifest_entry_filter=None, drop_stats=True) -> List[ManifestEntry]:
manifest_file_path = self.manifest_path / manifest_file_name
entries = []
@@ -107,6 +107,8 @@ class ManifestFileManager:
)
if manifest_entry_filter is not None and not manifest_entry_filter(entry):
continue
+ if drop_stats:
+ entry = entry.copy_without_stats()
entries.append(entry)
return entries
diff --git a/paimon-python/pypaimon/manifest/schema/data_file_meta.py b/paimon-python/pypaimon/manifest/schema/data_file_meta.py
index 1d1bcb56fb..8206061c84 100644
--- a/paimon-python/pypaimon/manifest/schema/data_file_meta.py
+++ b/paimon-python/pypaimon/manifest/schema/data_file_meta.py
@@ -61,6 +61,32 @@ class DataFileMeta:
path_builder = path_builder / ("bucket-" + str(bucket)) / self.file_name
self.file_path = str(path_builder)
+ def copy_without_stats(self) -> 'DataFileMeta':
+ """Create a new DataFileMeta without value statistics."""
+ return DataFileMeta(
+ file_name=self.file_name,
+ file_size=self.file_size,
+ row_count=self.row_count,
+ min_key=self.min_key,
+ max_key=self.max_key,
+ key_stats=self.key_stats,
+ value_stats=SimpleStats.empty_stats(),
+ min_sequence_number=self.min_sequence_number,
+ max_sequence_number=self.max_sequence_number,
+ schema_id=self.schema_id,
+ level=self.level,
+ extra_files=self.extra_files,
+ creation_time=self.creation_time,
+ delete_row_count=self.delete_row_count,
+ embedded_index=self.embedded_index,
+ file_source=self.file_source,
+ value_stats_cols=[],
+ external_path=self.external_path,
+ first_row_id=self.first_row_id,
+ write_cols=self.write_cols,
+ file_path=self.file_path
+ )
+
def assign_first_row_id(self, first_row_id: int) -> 'DataFileMeta':
"""Create a new DataFileMeta with the assigned first_row_id."""
return DataFileMeta(
diff --git a/paimon-python/pypaimon/manifest/schema/manifest_entry.py b/paimon-python/pypaimon/manifest/schema/manifest_entry.py
index 9608fbbd37..b1fd244daf 100644
--- a/paimon-python/pypaimon/manifest/schema/manifest_entry.py
+++ b/paimon-python/pypaimon/manifest/schema/manifest_entry.py
@@ -31,6 +31,16 @@ class ManifestEntry:
total_buckets: int
file: DataFileMeta
+ def copy_without_stats(self) -> 'ManifestEntry':
+ """Create a new ManifestEntry without value statistics."""
+ return ManifestEntry(
+ kind=self.kind,
+ partition=self.partition,
+ bucket=self.bucket,
+ total_buckets=self.total_buckets,
+ file=self.file.copy_without_stats()
+ )
+
def assign_first_row_id(self, first_row_id: int) -> 'ManifestEntry':
"""Create a new ManifestEntry with the assigned first_row_id."""
return ManifestEntry(
diff --git a/paimon-python/pypaimon/manifest/schema/simple_stats.py b/paimon-python/pypaimon/manifest/schema/simple_stats.py
index 45982491b9..19816fdd0f 100644
--- a/paimon-python/pypaimon/manifest/schema/simple_stats.py
+++ b/paimon-python/pypaimon/manifest/schema/simple_stats.py
@@ -18,6 +18,7 @@
from dataclasses import dataclass
from typing import List, Optional
+from typing import ClassVar
from pypaimon.table.row.generic_row import GenericRow
@@ -28,6 +29,16 @@ class SimpleStats:
max_values: GenericRow
null_counts: Optional[List[int]]
+ _empty_stats: ClassVar[object] = None
+
+ @classmethod
+ def empty_stats(cls):
+ if cls._empty_stats is None:
+ min_values = GenericRow([], [])
+ max_values = GenericRow([], [])
+ cls._empty_stats = cls(min_values, max_values, None)
+ return cls._empty_stats
+
SIMPLE_STATS_SCHEMA = {
"type": "record",
diff --git a/paimon-python/pypaimon/tests/predicates_test.py b/paimon-python/pypaimon/tests/predicates_test.py
index a3a0e3229c..6158d1d88b 100644
--- a/paimon-python/pypaimon/tests/predicates_test.py
+++ b/paimon-python/pypaimon/tests/predicates_test.py
@@ -454,20 +454,20 @@ class PredicateTest(unittest.TestCase):
if split.partition.values == ["p1", 2]:
count += 1
self.assertEqual(len(split.files), 1)
- min_values = split.files[0].value_stats.min_values.to_dict()
- max_values = split.files[0].value_stats.max_values.to_dict()
+ min_values = split.files[0].key_stats.min_values.to_dict()
+ max_values = split.files[0].key_stats.max_values.to_dict()
self.assertTrue(min_values["key1"] == 1 and min_values["key2"] == "e"
and max_values["key1"] == 4 and max_values["key2"] == "h")
elif split.partition.values == ["p2", 2]:
count += 1
- min_values = split.files[0].value_stats.min_values.to_dict()
- max_values = split.files[0].value_stats.max_values.to_dict()
+ min_values = split.files[0].key_stats.min_values.to_dict()
+ max_values = split.files[0].key_stats.max_values.to_dict()
self.assertTrue(min_values["key1"] == 5 and min_values["key2"] == "a"
and max_values["key1"] == 8 and max_values["key2"] == "d")
elif split.partition.values == ["p1", 1]:
count += 1
- min_values = split.files[0].value_stats.min_values.to_dict()
- max_values = split.files[0].value_stats.max_values.to_dict()
+ min_values = split.files[0].key_stats.min_values.to_dict()
+ max_values = split.files[0].key_stats.max_values.to_dict()
self.assertTrue(min_values["key1"] == max_values["key1"] == 7
and max_values["key2"] == max_values["key2"] == "b")
self.assertEqual(count, 3)
diff --git a/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py b/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py
index a184f64866..9be66d9759 100644
--- a/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py
+++ b/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py
@@ -180,7 +180,9 @@ class RESTAOReadWritePy36Test(RESTBaseTest):
latest_snapshot = SnapshotManager(table).get_latest_snapshot()
manifest_files = table_scan.starting_scanner.manifest_list_manager.read_all(latest_snapshot)
manifest_entries = table_scan.starting_scanner.manifest_file_manager.read(
- manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row))
+ manifest_files[0].file_name,
+ lambda row: table_scan.starting_scanner._filter_manifest_entry(row),
+ drop_stats=False)
min_value_stats = manifest_entries[0].file.value_stats.min_values.values
max_value_stats = manifest_entries[0].file.value_stats.max_values.values
expected_min_values = [col[0].as_py() for col in expect_data]
@@ -849,7 +851,7 @@ class RESTAOReadWritePy36Test(RESTBaseTest):
manifest_manager.write(manifest_file_name, [entry])
# Read the manifest entry back
- entries = manifest_manager.read(manifest_file_name)
+ entries = manifest_manager.read(manifest_file_name, drop_stats=False)
# Verify we have exactly one entry
self.assertEqual(len(entries), 1)
diff --git a/paimon-python/pypaimon/tests/reader_base_test.py b/paimon-python/pypaimon/tests/reader_base_test.py
index d158c824ef..a06e120e95 100644
--- a/paimon-python/pypaimon/tests/reader_base_test.py
+++ b/paimon-python/pypaimon/tests/reader_base_test.py
@@ -210,14 +210,22 @@ class ReaderBasicTest(unittest.TestCase):
read_builder = table.new_read_builder()
table_scan = read_builder.new_scan()
table_read = read_builder.new_read()
- actual_data = table_read.to_arrow(table_scan.plan().splits())
+ splits = table_scan.plan().splits()
+
+ # assert data file without stats
+ first_file = splits[0].files[0]
+ self.assertEqual(first_file.value_stats_cols, [])
+ self.assertEqual(first_file.value_stats, SimpleStats.empty_stats())
+
+ # assert equal
+ actual_data = table_read.to_arrow(splits)
self.assertEqual(actual_data, expect_data)
# to test GenericRow ability
latest_snapshot = SnapshotManager(table).get_latest_snapshot()
manifest_files = table_scan.starting_scanner.manifest_list_manager.read_all(latest_snapshot)
manifest_entries = table_scan.starting_scanner.manifest_file_manager.read(
- manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row))
+ manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row), False)
min_value_stats = manifest_entries[0].file.value_stats.min_values.values
max_value_stats = manifest_entries[0].file.value_stats.max_values.values
expected_min_values = [col[0].as_py() for col in expect_data]
@@ -627,7 +635,7 @@ class ReaderBasicTest(unittest.TestCase):
manifest_manager.write(manifest_file_name, [entry])
# Read the manifest entry back
- entries = manifest_manager.read(manifest_file_name)
+ entries = manifest_manager.read(manifest_file_name, drop_stats=False)
# Verify we have exactly one entry
self.assertEqual(len(entries), 1)