(paimon) 11/18: [Python] Remove the use of reference types in DATA_FILE_META_SCHEMA (#6549)

lzljs3620320 Mon, 10 Nov 2025 21:41:11 -0800

This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch release-1.3
in repository https://gitbox.apache.org/repos/asf/paimon.git


commit e4ecfac05ccd1a952ce7fb6ea5cf5431c804c099
Author: Zhang Jiawei <[email protected]>
AuthorDate: Fri Nov 7 13:37:29 2025 +0800

    [Python] Remove the use of reference types in DATA_FILE_META_SCHEMA (#6549)
---
 .../pypaimon/manifest/schema/data_file_meta.py     |   6 +-
 .../pypaimon/manifest/schema/manifest_file_meta.py |   4 +-
 .../pypaimon/manifest/schema/simple_stats.py       |  44 +++---
 .../pypaimon/tests/manifest_schema_test.py         | 147 +++++++++++++++++++++
 4 files changed, 181 insertions(+), 20 deletions(-)

diff --git a/paimon-python/pypaimon/manifest/schema/data_file_meta.py b/paimon-python/pypaimon/manifest/schema/data_file_meta.py
index 8206061c84..a414644e27 100644
--- a/paimon-python/pypaimon/manifest/schema/data_file_meta.py
+++ b/paimon-python/pypaimon/manifest/schema/data_file_meta.py
@@ -21,7 +21,7 @@ from datetime import datetime
 from pathlib import Path
 from typing import List, Optional
 
-from pypaimon.manifest.schema.simple_stats import (SIMPLE_STATS_SCHEMA,
+from pypaimon.manifest.schema.simple_stats import (KEY_STATS_SCHEMA, VALUE_STATS_SCHEMA,
                                                    SimpleStats)
 from pypaimon.table.row.generic_row import GenericRow
 
@@ -149,8 +149,8 @@ DATA_FILE_META_SCHEMA = {
         {"name": "_ROW_COUNT", "type": "long"},
         {"name": "_MIN_KEY", "type": "bytes"},
         {"name": "_MAX_KEY", "type": "bytes"},
-        {"name": "_KEY_STATS", "type": SIMPLE_STATS_SCHEMA},
-        {"name": "_VALUE_STATS", "type": "SimpleStats"},
+        {"name": "_KEY_STATS", "type": KEY_STATS_SCHEMA},
+        {"name": "_VALUE_STATS", "type": VALUE_STATS_SCHEMA},
         {"name": "_MIN_SEQUENCE_NUMBER", "type": "long"},
         {"name": "_MAX_SEQUENCE_NUMBER", "type": "long"},
         {"name": "_SCHEMA_ID", "type": "long"},
diff --git a/paimon-python/pypaimon/manifest/schema/manifest_file_meta.py b/paimon-python/pypaimon/manifest/schema/manifest_file_meta.py
index 443c6a0944..a830be6cc2 100644
--- a/paimon-python/pypaimon/manifest/schema/manifest_file_meta.py
+++ b/paimon-python/pypaimon/manifest/schema/manifest_file_meta.py
@@ -18,7 +18,7 @@
 
 from dataclasses import dataclass
 
-from pypaimon.manifest.schema.simple_stats import (SIMPLE_STATS_SCHEMA,
+from pypaimon.manifest.schema.simple_stats import (PARTITION_STATS_SCHEMA,
                                                    SimpleStats)
 
 
@@ -41,7 +41,7 @@ MANIFEST_FILE_META_SCHEMA = {
         {"name": "_FILE_SIZE", "type": "long"},
         {"name": "_NUM_ADDED_FILES", "type": "long"},
         {"name": "_NUM_DELETED_FILES", "type": "long"},
-        {"name": "_PARTITION_STATS", "type": SIMPLE_STATS_SCHEMA},
+        {"name": "_PARTITION_STATS", "type": PARTITION_STATS_SCHEMA},
         {"name": "_SCHEMA_ID", "type": "long"},
     ]
 }
diff --git a/paimon-python/pypaimon/manifest/schema/simple_stats.py b/paimon-python/pypaimon/manifest/schema/simple_stats.py
index 7519f881c3..10d9e62420 100644
--- a/paimon-python/pypaimon/manifest/schema/simple_stats.py
+++ b/paimon-python/pypaimon/manifest/schema/simple_stats.py
@@ -41,20 +41,34 @@ class SimpleStats:
         return cls._empty_stats
 
 
-SIMPLE_STATS_SCHEMA = {
+SIMPLE_STATS_FIELDS = [
+    {"name": "_MIN_VALUES", "type": "bytes"},
+    {"name": "_MAX_VALUES", "type": "bytes"},
+    {"name": "_NULL_COUNTS",
+     "type": [
+         "null",
+         {
+             "type": "array",
+             "items": ["null", "long"]
+         }
+     ],
+     "default": None},
+]
+
+KEY_STATS_SCHEMA = {
+    "type": "record",
+    "name": "record_KEY_STATS",
+    "fields": SIMPLE_STATS_FIELDS
+}
+
+VALUE_STATS_SCHEMA = {
+    "type": "record",
+    "name": "record_VALUE_STATS",
+    "fields": SIMPLE_STATS_FIELDS
+}
+
+PARTITION_STATS_SCHEMA = {
     "type": "record",
-    "name": "SimpleStats",
-    "fields": [
-        {"name": "_MIN_VALUES", "type": "bytes"},
-        {"name": "_MAX_VALUES", "type": "bytes"},
-        {"name": "_NULL_COUNTS",
-         "type": [
-             "null",
-             {
-                 "type": "array",
-                 "items": ["null", "long"]
-             }
-         ],
-         "default": None},
-    ]
+    "name": "record_PARTITION_STATS",
+    "fields": SIMPLE_STATS_FIELDS
 }
diff --git a/paimon-python/pypaimon/tests/manifest_schema_test.py b/paimon-python/pypaimon/tests/manifest_schema_test.py
new file mode 100644
index 0000000000..3d82ededa6
--- /dev/null
+++ b/paimon-python/pypaimon/tests/manifest_schema_test.py
@@ -0,0 +1,147 @@
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import unittest
+
+from pypaimon.manifest.schema.data_file_meta import DATA_FILE_META_SCHEMA
+from pypaimon.manifest.schema.manifest_file_meta import MANIFEST_FILE_META_SCHEMA
+from pypaimon.manifest.schema.simple_stats import (
+    KEY_STATS_SCHEMA,
+    VALUE_STATS_SCHEMA,
+    PARTITION_STATS_SCHEMA
+)
+
+
+class ManifestSchemaTest(unittest.TestCase):
+    """Test cases for the manifest schema definitions."""
+
+    def test_data_file_meta_schema_structure(self):
+        """Test that DATA_FILE_META_SCHEMA has the correct structure."""
+        self.assertIsInstance(DATA_FILE_META_SCHEMA, dict)
+        self.assertEqual(DATA_FILE_META_SCHEMA["type"], "record")
+        self.assertEqual(DATA_FILE_META_SCHEMA["name"], "DataFileMeta")
+        self.assertIn("fields", DATA_FILE_META_SCHEMA)
+
+        fields = DATA_FILE_META_SCHEMA["fields"]
+        self.assertIsInstance(fields, list)
+
+        # Create a mapping of field names to field definitions for easier testing
+        field_map = {field["name"]: field for field in fields}
+
+        # Check that all expected fields are present
+        expected_fields = [
+            "_FILE_NAME", "_FILE_SIZE", "_ROW_COUNT", "_MIN_KEY", "_MAX_KEY",
+            "_KEY_STATS", "_VALUE_STATS", "_MIN_SEQUENCE_NUMBER",
+            "_MAX_SEQUENCE_NUMBER", "_SCHEMA_ID", "_LEVEL", "_EXTRA_FILES",
+            "_CREATION_TIME", "_DELETE_ROW_COUNT", "_EMBEDDED_FILE_INDEX",
+            "_FILE_SOURCE", "_VALUE_STATS_COLS", "_EXTERNAL_PATH",
+            "_FIRST_ROW_ID", "_WRITE_COLS"
+        ]
+
+        for field_name in expected_fields:
+            self.assertIn(field_name, field_map, f"Field {field_name} is missing")
+
+        # Check specific field types
+        self.assertEqual(field_map["_FILE_NAME"]["type"], "string")
+        self.assertEqual(field_map["_FILE_SIZE"]["type"], "long")
+        self.assertEqual(field_map["_ROW_COUNT"]["type"], "long")
+        self.assertEqual(field_map["_MIN_KEY"]["type"], "bytes")
+        self.assertEqual(field_map["_MAX_KEY"]["type"], "bytes")
+        self.assertEqual(field_map["_KEY_STATS"]["type"], KEY_STATS_SCHEMA)
+        self.assertEqual(field_map["_VALUE_STATS"]["type"], VALUE_STATS_SCHEMA)
+        self.assertEqual(field_map["_MIN_SEQUENCE_NUMBER"]["type"], "long")
+        self.assertEqual(field_map["_MAX_SEQUENCE_NUMBER"]["type"], "long")
+        self.assertEqual(field_map["_SCHEMA_ID"]["type"], "long")
+        self.assertEqual(field_map["_LEVEL"]["type"], "int")
+        self.assertEqual(field_map["_EXTRA_FILES"]["type"], {"type": "array", "items": "string"})
+        self.assertEqual(field_map["_CREATION_TIME"]["type"],
+                         ["null", {"type": "long", "logicalType": "timestamp-millis"}])
+        self.assertEqual(field_map["_DELETE_ROW_COUNT"]["type"], ["null", "long"])
+        self.assertEqual(field_map["_EMBEDDED_FILE_INDEX"]["type"], ["null", "bytes"])
+        self.assertEqual(field_map["_FILE_SOURCE"]["type"], ["null", "string"])
+        self.assertEqual(field_map["_VALUE_STATS_COLS"]["type"], ["null", {"type": "array", "items": "string"}])
+        self.assertEqual(field_map["_EXTERNAL_PATH"]["type"], ["null", "string"])
+        self.assertEqual(field_map["_FIRST_ROW_ID"]["type"], ["null", "long"])
+        self.assertEqual(field_map["_WRITE_COLS"]["type"], ["null", {"type": "array", "items": "string"}])
+
+    def test_manifest_file_meta_schema_structure(self):
+        """Test that MANIFEST_FILE_META_SCHEMA has the correct structure."""
+        self.assertIsInstance(MANIFEST_FILE_META_SCHEMA, dict)
+        self.assertEqual(MANIFEST_FILE_META_SCHEMA["type"], "record")
+        self.assertEqual(MANIFEST_FILE_META_SCHEMA["name"], "ManifestFileMeta")
+        self.assertIn("fields", MANIFEST_FILE_META_SCHEMA)
+
+        fields = MANIFEST_FILE_META_SCHEMA["fields"]
+        self.assertIsInstance(fields, list)
+
+        # Create a mapping of field names to field definitions for easier testing
+        field_map = {field["name"]: field for field in fields}
+
+        # Check that all expected fields are present
+        expected_fields = [
+            "_VERSION", "_FILE_NAME", "_FILE_SIZE", "_NUM_ADDED_FILES",
+            "_NUM_DELETED_FILES", "_PARTITION_STATS", "_SCHEMA_ID"
+        ]
+
+        for field_name in expected_fields:
+            self.assertIn(field_name, field_map, f"Field {field_name} is missing")
+
+        # Check specific field types
+        self.assertEqual(field_map["_VERSION"]["type"], "int")
+        self.assertEqual(field_map["_FILE_NAME"]["type"], "string")
+        self.assertEqual(field_map["_FILE_SIZE"]["type"], "long")
+        self.assertEqual(field_map["_NUM_ADDED_FILES"]["type"], "long")
+        self.assertEqual(field_map["_NUM_DELETED_FILES"]["type"], "long")
+        self.assertEqual(field_map["_PARTITION_STATS"]["type"], PARTITION_STATS_SCHEMA)
+        self.assertEqual(field_map["_SCHEMA_ID"]["type"], "long")
+
+    def test_schema_references(self):
+        """Test that schema references are correctly used."""
+        data_file_fields = {field["name"]: field for field in DATA_FILE_META_SCHEMA["fields"]}
+        manifest_file_fields = {field["name"]: field for field in MANIFEST_FILE_META_SCHEMA["fields"]}
+
+        # Check that _KEY_STATS references KEY_STATS_SCHEMA
+        key_stats_field = data_file_fields["_KEY_STATS"]
+        self.assertEqual(key_stats_field["type"], KEY_STATS_SCHEMA)
+
+        # Check that _VALUE_STATS references VALUE_STATS_SCHEMA
+        value_stats_field = data_file_fields["_VALUE_STATS"]
+        self.assertEqual(value_stats_field["type"], VALUE_STATS_SCHEMA)
+
+        # Check that _PARTITION_STATS references PARTITION_STATS_SCHEMA
+        partition_stats_field = manifest_file_fields["_PARTITION_STATS"]
+        self.assertEqual(partition_stats_field["type"], PARTITION_STATS_SCHEMA)
+
+    def test_schema_consistency(self):
+        """Test that schema definitions are consistent."""
+        # Verify that all stats schemas have the same structure
+        self.assertEqual(KEY_STATS_SCHEMA["type"], "record")
+        self.assertEqual(VALUE_STATS_SCHEMA["type"], "record")
+        self.assertEqual(PARTITION_STATS_SCHEMA["type"], "record")
+
+        # Verify that all stats schemas have different names
+        names = [
+            KEY_STATS_SCHEMA["name"],
+            VALUE_STATS_SCHEMA["name"],
+            PARTITION_STATS_SCHEMA["name"]
+        ]
+        self.assertEqual(len(names), len(set(names)), "Schema names should be unique")
+
+
+if __name__ == "__main__":
+    unittest.main()

(paimon) 11/18: [Python] Remove the use of reference types in DATA_FILE_META_SCHEMA (#6549)

Reply via email to