This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 4c875333f9 [python] fix reading on legacy manifest list without
_MIN_ROW_ID/_MAX_ROW_ID (#7248)
4c875333f9 is described below
commit 4c875333f9e7ee3bfdca645b36545ff8a40fafba
Author: XiaoHongbo <[email protected]>
AuthorDate: Tue Feb 10 18:26:50 2026 +0800
[python] fix reading on legacy manifest list without
_MIN_ROW_ID/_MAX_ROW_ID (#7248)
---
.../pypaimon/manifest/manifest_list_manager.py | 4 +-
.../tests/manifest/manifest_schema_test.py | 88 +++++++++++++++++++++-
2 files changed, 87 insertions(+), 5 deletions(-)
diff --git a/paimon-python/pypaimon/manifest/manifest_list_manager.py
b/paimon-python/pypaimon/manifest/manifest_list_manager.py
index 81f1714b2b..0bc0cc2b88 100644
--- a/paimon-python/pypaimon/manifest/manifest_list_manager.py
+++ b/paimon-python/pypaimon/manifest/manifest_list_manager.py
@@ -81,8 +81,8 @@ class ManifestListManager:
num_deleted_files=record['_NUM_DELETED_FILES'],
partition_stats=partition_stats,
schema_id=record['_SCHEMA_ID'],
- min_row_id=record['_MIN_ROW_ID'],
- max_row_id=record['_MAX_ROW_ID'],
+ min_row_id=record.get('_MIN_ROW_ID'),
+ max_row_id=record.get('_MAX_ROW_ID'),
)
manifest_files.append(manifest_file_meta)
diff --git a/paimon-python/pypaimon/tests/manifest/manifest_schema_test.py
b/paimon-python/pypaimon/tests/manifest/manifest_schema_test.py
index 13890f783c..a6076998d4 100644
--- a/paimon-python/pypaimon/tests/manifest/manifest_schema_test.py
+++ b/paimon-python/pypaimon/tests/manifest/manifest_schema_test.py
@@ -15,18 +15,44 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
+import os
+import tempfile
import unittest
+from io import BytesIO
+from unittest.mock import Mock
-from pypaimon.manifest.schema import data_file_meta
+import fastavro
+from pypaimon.common.options import Options
+from pypaimon.filesystem.local_file_io import LocalFileIO
+from pypaimon.manifest.manifest_list_manager import ManifestListManager
+from pypaimon.manifest.schema import data_file_meta
from pypaimon.manifest.schema.data_file_meta import DATA_FILE_META_SCHEMA
from pypaimon.manifest.schema.manifest_file_meta import
MANIFEST_FILE_META_SCHEMA
from pypaimon.manifest.schema.simple_stats import (
KEY_STATS_SCHEMA,
VALUE_STATS_SCHEMA,
- PARTITION_STATS_SCHEMA
+ PARTITION_STATS_SCHEMA,
)
+LEGACY_MANIFEST_FILE_META_SCHEMA = {
+ "type": "record",
+ "name": "ManifestFileMeta",
+ "fields": [
+ {"name": "_VERSION", "type": "int"},
+ {"name": "_FILE_NAME", "type": "string"},
+ {"name": "_FILE_SIZE", "type": "long"},
+ {"name": "_NUM_ADDED_FILES", "type": "long"},
+ {"name": "_NUM_DELETED_FILES", "type": "long"},
+ {"name": "_PARTITION_STATS", "type": PARTITION_STATS_SCHEMA},
+ {"name": "_SCHEMA_ID", "type": "long"},
+ ],
+}
+
+
+def _empty_partition_stats_bytes():
+ return b"\x00\x00\x00\x00\x00"
+
class ManifestSchemaTest(unittest.TestCase):
def test_file_source_field_type_and_default(self):
@@ -103,7 +129,8 @@ class ManifestSchemaTest(unittest.TestCase):
# Check that all expected fields are present
expected_fields = [
"_VERSION", "_FILE_NAME", "_FILE_SIZE", "_NUM_ADDED_FILES",
- "_NUM_DELETED_FILES", "_PARTITION_STATS", "_SCHEMA_ID"
+ "_NUM_DELETED_FILES", "_PARTITION_STATS", "_SCHEMA_ID",
+ "_MIN_ROW_ID", "_MAX_ROW_ID",
]
for field_name in expected_fields:
@@ -117,6 +144,16 @@ class ManifestSchemaTest(unittest.TestCase):
self.assertEqual(field_map["_NUM_DELETED_FILES"]["type"], "long")
self.assertEqual(field_map["_PARTITION_STATS"]["type"],
PARTITION_STATS_SCHEMA)
self.assertEqual(field_map["_SCHEMA_ID"]["type"], "long")
+ self.assertEqual(field_map["_MIN_ROW_ID"]["type"], ["null", "long"])
+ self.assertEqual(field_map["_MAX_ROW_ID"]["type"], ["null", "long"])
+ self.assertIsNone(
+ field_map["_MIN_ROW_ID"].get("default"),
+ "_MIN_ROW_ID should have default None for backward compatibility",
+ )
+ self.assertIsNone(
+ field_map["_MAX_ROW_ID"].get("default"),
+ "_MAX_ROW_ID should have default None for backward compatibility",
+ )
def test_schema_references(self):
"""Test that schema references are correctly used."""
@@ -149,3 +186,48 @@ class ManifestSchemaTest(unittest.TestCase):
PARTITION_STATS_SCHEMA["name"]
]
self.assertEqual(len(names), len(set(names)), "Schema names should be
unique")
+
+ def test_read_legacy_manifest_list(self):
+ temp_dir = tempfile.mkdtemp(prefix="manifest_schema_test_")
+ table_path = f"file://{temp_dir}"
+ file_io = LocalFileIO(table_path, Options({}))
+ os.makedirs(os.path.join(temp_dir, "manifest"), exist_ok=True)
+ manifest_list_name = "legacy-manifest-list-avro"
+
+ legacy_record = {
+ "_VERSION": 2,
+ "_FILE_NAME": "data-12345.parquet",
+ "_FILE_SIZE": 1000,
+ "_NUM_ADDED_FILES": 1,
+ "_NUM_DELETED_FILES": 0,
+ "_PARTITION_STATS": {
+ "_MIN_VALUES": _empty_partition_stats_bytes(),
+ "_MAX_VALUES": _empty_partition_stats_bytes(),
+ "_NULL_COUNTS": None,
+ },
+ "_SCHEMA_ID": 0,
+ }
+ with self.assertRaises(KeyError):
+ _ = legacy_record["_MIN_ROW_ID"]
+ self.assertIsNone(legacy_record.get("_MIN_ROW_ID"))
+ self.assertIsNone(legacy_record.get("_MAX_ROW_ID"))
+
+ buffer = BytesIO()
+ fastavro.writer(buffer, LEGACY_MANIFEST_FILE_META_SCHEMA,
[legacy_record])
+ with
file_io.new_output_stream(f"{table_path}/manifest/{manifest_list_name}") as out:
+ out.write(buffer.getvalue())
+
+ table = Mock()
+ table.table_path = table_path
+ table.file_io = file_io
+ table.partition_keys_fields = []
+ manager = ManifestListManager(table)
+ metas = manager.read(manifest_list_name)
+
+ self.assertEqual(len(metas), 1)
+ meta = metas[0]
+ self.assertEqual(meta.file_name, "data-12345.parquet")
+ self.assertEqual(meta.file_size, 1000)
+ self.assertEqual(meta.schema_id, 0)
+ self.assertIsNone(meta.min_row_id)
+ self.assertIsNone(meta.max_row_id)