This is an automated email from the ASF dual-hosted git repository. lzljs3620320 pushed a commit to branch release-1.3 in repository https://gitbox.apache.org/repos/asf/paimon.git
commit e4ecfac05ccd1a952ce7fb6ea5cf5431c804c099 Author: Zhang Jiawei <[email protected]> AuthorDate: Fri Nov 7 13:37:29 2025 +0800 [Python] Remove the use of reference types in DATA_FILE_META_SCHEMA (#6549) --- .../pypaimon/manifest/schema/data_file_meta.py | 6 +- .../pypaimon/manifest/schema/manifest_file_meta.py | 4 +- .../pypaimon/manifest/schema/simple_stats.py | 44 +++--- .../pypaimon/tests/manifest_schema_test.py | 147 +++++++++++++++++++++ 4 files changed, 181 insertions(+), 20 deletions(-) diff --git a/paimon-python/pypaimon/manifest/schema/data_file_meta.py b/paimon-python/pypaimon/manifest/schema/data_file_meta.py index 8206061c84..a414644e27 100644 --- a/paimon-python/pypaimon/manifest/schema/data_file_meta.py +++ b/paimon-python/pypaimon/manifest/schema/data_file_meta.py @@ -21,7 +21,7 @@ from datetime import datetime from pathlib import Path from typing import List, Optional -from pypaimon.manifest.schema.simple_stats import (SIMPLE_STATS_SCHEMA, +from pypaimon.manifest.schema.simple_stats import (KEY_STATS_SCHEMA, VALUE_STATS_SCHEMA, SimpleStats) from pypaimon.table.row.generic_row import GenericRow @@ -149,8 +149,8 @@ DATA_FILE_META_SCHEMA = { {"name": "_ROW_COUNT", "type": "long"}, {"name": "_MIN_KEY", "type": "bytes"}, {"name": "_MAX_KEY", "type": "bytes"}, - {"name": "_KEY_STATS", "type": SIMPLE_STATS_SCHEMA}, - {"name": "_VALUE_STATS", "type": "SimpleStats"}, + {"name": "_KEY_STATS", "type": KEY_STATS_SCHEMA}, + {"name": "_VALUE_STATS", "type": VALUE_STATS_SCHEMA}, {"name": "_MIN_SEQUENCE_NUMBER", "type": "long"}, {"name": "_MAX_SEQUENCE_NUMBER", "type": "long"}, {"name": "_SCHEMA_ID", "type": "long"}, diff --git a/paimon-python/pypaimon/manifest/schema/manifest_file_meta.py b/paimon-python/pypaimon/manifest/schema/manifest_file_meta.py index 443c6a0944..a830be6cc2 100644 --- a/paimon-python/pypaimon/manifest/schema/manifest_file_meta.py +++ b/paimon-python/pypaimon/manifest/schema/manifest_file_meta.py @@ -18,7 +18,7 @@ from dataclasses import dataclass -from pypaimon.manifest.schema.simple_stats import (SIMPLE_STATS_SCHEMA, +from pypaimon.manifest.schema.simple_stats import (PARTITION_STATS_SCHEMA, SimpleStats) @@ -41,7 +41,7 @@ MANIFEST_FILE_META_SCHEMA = { {"name": "_FILE_SIZE", "type": "long"}, {"name": "_NUM_ADDED_FILES", "type": "long"}, {"name": "_NUM_DELETED_FILES", "type": "long"}, - {"name": "_PARTITION_STATS", "type": SIMPLE_STATS_SCHEMA}, + {"name": "_PARTITION_STATS", "type": PARTITION_STATS_SCHEMA}, {"name": "_SCHEMA_ID", "type": "long"}, ] } diff --git a/paimon-python/pypaimon/manifest/schema/simple_stats.py b/paimon-python/pypaimon/manifest/schema/simple_stats.py index 7519f881c3..10d9e62420 100644 --- a/paimon-python/pypaimon/manifest/schema/simple_stats.py +++ b/paimon-python/pypaimon/manifest/schema/simple_stats.py @@ -41,20 +41,34 @@ class SimpleStats: return cls._empty_stats -SIMPLE_STATS_SCHEMA = { +SIMPLE_STATS_FIELDS = [ + {"name": "_MIN_VALUES", "type": "bytes"}, + {"name": "_MAX_VALUES", "type": "bytes"}, + {"name": "_NULL_COUNTS", + "type": [ + "null", + { + "type": "array", + "items": ["null", "long"] + } + ], + "default": None}, +] + +KEY_STATS_SCHEMA = { + "type": "record", + "name": "record_KEY_STATS", + "fields": SIMPLE_STATS_FIELDS +} + +VALUE_STATS_SCHEMA = { + "type": "record", + "name": "record_VALUE_STATS", + "fields": SIMPLE_STATS_FIELDS +} + +PARTITION_STATS_SCHEMA = { "type": "record", - "name": "SimpleStats", - "fields": [ - {"name": "_MIN_VALUES", "type": "bytes"}, - {"name": "_MAX_VALUES", "type": "bytes"}, - {"name": "_NULL_COUNTS", - "type": [ - "null", - { - "type": "array", - "items": ["null", "long"] - } - ], - "default": None}, - ] + "name": "record_PARTITION_STATS", + "fields": SIMPLE_STATS_FIELDS } diff --git a/paimon-python/pypaimon/tests/manifest_schema_test.py b/paimon-python/pypaimon/tests/manifest_schema_test.py new file mode 100644 index 0000000000..3d82ededa6 --- /dev/null +++ b/paimon-python/pypaimon/tests/manifest_schema_test.py @@ -0,0 +1,147 @@ +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import unittest + +from pypaimon.manifest.schema.data_file_meta import DATA_FILE_META_SCHEMA +from pypaimon.manifest.schema.manifest_file_meta import MANIFEST_FILE_META_SCHEMA +from pypaimon.manifest.schema.simple_stats import ( + KEY_STATS_SCHEMA, + VALUE_STATS_SCHEMA, + PARTITION_STATS_SCHEMA +) + + +class ManifestSchemaTest(unittest.TestCase): + """Test cases for the manifest schema definitions.""" + + def test_data_file_meta_schema_structure(self): + """Test that DATA_FILE_META_SCHEMA has the correct structure.""" + self.assertIsInstance(DATA_FILE_META_SCHEMA, dict) + self.assertEqual(DATA_FILE_META_SCHEMA["type"], "record") + self.assertEqual(DATA_FILE_META_SCHEMA["name"], "DataFileMeta") + self.assertIn("fields", DATA_FILE_META_SCHEMA) + + fields = DATA_FILE_META_SCHEMA["fields"] + self.assertIsInstance(fields, list) + + # Create a mapping of field names to field definitions for easier testing + field_map = {field["name"]: field for field in fields} + + # Check that all expected fields are present + expected_fields = [ + "_FILE_NAME", "_FILE_SIZE", "_ROW_COUNT", "_MIN_KEY", "_MAX_KEY", + "_KEY_STATS", "_VALUE_STATS", "_MIN_SEQUENCE_NUMBER", + "_MAX_SEQUENCE_NUMBER", "_SCHEMA_ID", "_LEVEL", "_EXTRA_FILES", + "_CREATION_TIME", "_DELETE_ROW_COUNT", "_EMBEDDED_FILE_INDEX", + "_FILE_SOURCE", "_VALUE_STATS_COLS", "_EXTERNAL_PATH", + "_FIRST_ROW_ID", "_WRITE_COLS" + ] + + for field_name in expected_fields: + self.assertIn(field_name, field_map, f"Field {field_name} is missing") + + # Check specific field types + self.assertEqual(field_map["_FILE_NAME"]["type"], "string") + self.assertEqual(field_map["_FILE_SIZE"]["type"], "long") + self.assertEqual(field_map["_ROW_COUNT"]["type"], "long") + self.assertEqual(field_map["_MIN_KEY"]["type"], "bytes") + self.assertEqual(field_map["_MAX_KEY"]["type"], "bytes") + self.assertEqual(field_map["_KEY_STATS"]["type"], KEY_STATS_SCHEMA) + self.assertEqual(field_map["_VALUE_STATS"]["type"], VALUE_STATS_SCHEMA) + self.assertEqual(field_map["_MIN_SEQUENCE_NUMBER"]["type"], "long") + self.assertEqual(field_map["_MAX_SEQUENCE_NUMBER"]["type"], "long") + self.assertEqual(field_map["_SCHEMA_ID"]["type"], "long") + self.assertEqual(field_map["_LEVEL"]["type"], "int") + self.assertEqual(field_map["_EXTRA_FILES"]["type"], {"type": "array", "items": "string"}) + self.assertEqual(field_map["_CREATION_TIME"]["type"], + ["null", {"type": "long", "logicalType": "timestamp-millis"}]) + self.assertEqual(field_map["_DELETE_ROW_COUNT"]["type"], ["null", "long"]) + self.assertEqual(field_map["_EMBEDDED_FILE_INDEX"]["type"], ["null", "bytes"]) + self.assertEqual(field_map["_FILE_SOURCE"]["type"], ["null", "string"]) + self.assertEqual(field_map["_VALUE_STATS_COLS"]["type"], ["null", {"type": "array", "items": "string"}]) + self.assertEqual(field_map["_EXTERNAL_PATH"]["type"], ["null", "string"]) + self.assertEqual(field_map["_FIRST_ROW_ID"]["type"], ["null", "long"]) + self.assertEqual(field_map["_WRITE_COLS"]["type"], ["null", {"type": "array", "items": "string"}]) + + def test_manifest_file_meta_schema_structure(self): + """Test that MANIFEST_FILE_META_SCHEMA has the correct structure.""" + self.assertIsInstance(MANIFEST_FILE_META_SCHEMA, dict) + self.assertEqual(MANIFEST_FILE_META_SCHEMA["type"], "record") + self.assertEqual(MANIFEST_FILE_META_SCHEMA["name"], "ManifestFileMeta") + self.assertIn("fields", MANIFEST_FILE_META_SCHEMA) + + fields = MANIFEST_FILE_META_SCHEMA["fields"] + self.assertIsInstance(fields, list) + + # Create a mapping of field names to field definitions for easier testing + field_map = {field["name"]: field for field in fields} + + # Check that all expected fields are present + expected_fields = [ + "_VERSION", "_FILE_NAME", "_FILE_SIZE", "_NUM_ADDED_FILES", + "_NUM_DELETED_FILES", "_PARTITION_STATS", "_SCHEMA_ID" + ] + + for field_name in expected_fields: + self.assertIn(field_name, field_map, f"Field {field_name} is missing") + + # Check specific field types + self.assertEqual(field_map["_VERSION"]["type"], "int") + self.assertEqual(field_map["_FILE_NAME"]["type"], "string") + self.assertEqual(field_map["_FILE_SIZE"]["type"], "long") + self.assertEqual(field_map["_NUM_ADDED_FILES"]["type"], "long") + self.assertEqual(field_map["_NUM_DELETED_FILES"]["type"], "long") + self.assertEqual(field_map["_PARTITION_STATS"]["type"], PARTITION_STATS_SCHEMA) + self.assertEqual(field_map["_SCHEMA_ID"]["type"], "long") + + def test_schema_references(self): + """Test that schema references are correctly used.""" + data_file_fields = {field["name"]: field for field in DATA_FILE_META_SCHEMA["fields"]} + manifest_file_fields = {field["name"]: field for field in MANIFEST_FILE_META_SCHEMA["fields"]} + + # Check that _KEY_STATS references KEY_STATS_SCHEMA + key_stats_field = data_file_fields["_KEY_STATS"] + self.assertEqual(key_stats_field["type"], KEY_STATS_SCHEMA) + + # Check that _VALUE_STATS references VALUE_STATS_SCHEMA + value_stats_field = data_file_fields["_VALUE_STATS"] + self.assertEqual(value_stats_field["type"], VALUE_STATS_SCHEMA) + + # Check that _PARTITION_STATS references PARTITION_STATS_SCHEMA + partition_stats_field = manifest_file_fields["_PARTITION_STATS"] + self.assertEqual(partition_stats_field["type"], PARTITION_STATS_SCHEMA) + + def test_schema_consistency(self): + """Test that schema definitions are consistent.""" + # Verify that all stats schemas have the same structure + self.assertEqual(KEY_STATS_SCHEMA["type"], "record") + self.assertEqual(VALUE_STATS_SCHEMA["type"], "record") + self.assertEqual(PARTITION_STATS_SCHEMA["type"], "record") + + # Verify that all stats schemas have different names + names = [ + KEY_STATS_SCHEMA["name"], + VALUE_STATS_SCHEMA["name"], + PARTITION_STATS_SCHEMA["name"] + ] + self.assertEqual(len(names), len(set(names)), "Schema names should be unique") + + +if __name__ == "__main__": + unittest.main()
