This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 6e7481ea95 [python] Fix string size being an integer multiple of bytes
(#6225)
6e7481ea95 is described below
commit 6e7481ea95bfec3fa6d89b3850d1b3653cd85084
Author: umi <[email protected]>
AuthorDate: Tue Sep 9 23:54:44 2025 +0800
[python] Fix string size being an integer multiple of bytes (#6225)
---
paimon-python/pypaimon/table/row/binary_row.py | 14 ++++++++--
.../pypaimon/tests/py36/ao_read_write_test.py | 30 ++++++++++++++++++++++
paimon-python/pypaimon/tests/reader_basic_test.py | 30 ++++++++++++++++++++++
3 files changed, 72 insertions(+), 2 deletions(-)
diff --git a/paimon-python/pypaimon/table/row/binary_row.py
b/paimon-python/pypaimon/table/row/binary_row.py
index 468556dcb0..d52e38cf11 100644
--- a/paimon-python/pypaimon/table/row/binary_row.py
+++ b/paimon-python/pypaimon/table/row/binary_row.py
@@ -276,9 +276,11 @@ class BinaryRowSerializer:
header_byte = 0x80 | length
fixed_part[field_fixed_offset + 7] = header_byte
else:
+ var_length =
cls._round_number_of_bytes_to_nearest_word(len(value_bytes))
+ var_value_bytes = value_bytes + b'\x00' * (var_length -
length)
offset_in_variable_part = current_variable_offset
- variable_part_data.append(value_bytes)
- current_variable_offset += length
+ variable_part_data.append(var_value_bytes)
+ current_variable_offset += var_length
absolute_offset = fixed_part_size + offset_in_variable_part
offset_and_len = (absolute_offset << 32) | length
@@ -401,3 +403,11 @@ class BinaryRowSerializer:
else:
millis = value.hour * 3600000 + value.minute * 60000 +
value.second * 1000 + value.microsecond // 1000
return struct.pack('<i', millis)
+
+ @classmethod
+ def _round_number_of_bytes_to_nearest_word(cls, num_bytes: int) -> int:
+ remainder = num_bytes & 0x07
+ if remainder == 0:
+ return num_bytes
+ else:
+ return num_bytes + (8 - remainder)
diff --git a/paimon-python/pypaimon/tests/py36/ao_read_write_test.py
b/paimon-python/pypaimon/tests/py36/ao_read_write_test.py
index 7ae847c9f3..de174e6413 100644
--- a/paimon-python/pypaimon/tests/py36/ao_read_write_test.py
+++ b/paimon-python/pypaimon/tests/py36/ao_read_write_test.py
@@ -19,6 +19,11 @@ import logging
import pandas as pd
import pyarrow as pa
+from pypaimon.schema.data_types import DataField, AtomicType
+
+from pypaimon.table.row.row_kind import RowKind
+
+from pypaimon.table.row.binary_row import BinaryRow, BinaryRowSerializer,
BinaryRowDeserializer
from pypaimon.api.options import Options
from pypaimon.catalog.catalog_context import CatalogContext
@@ -364,3 +369,28 @@ class RESTTableReadWritePy36Test(RESTCatalogBaseTest):
table_read = read_builder.new_read()
splits = read_builder.new_scan().plan().splits()
self.assertEqual(table_read.to_arrow(splits).num_rows, total_rows)
+
+ def test_to_bytes_with_long_string(self):
+ """Test serialization of strings longer than 7 bytes which require
variable part storage."""
+ # Create fields with a long string value
+ fields = [
+ DataField(0, "long_string", AtomicType("STRING")),
+ ]
+
+ # String longer than 7 bytes will be stored in variable part
+ long_string = "This is a long string that exceeds 7 bytes"
+ values = [long_string]
+
+ binary_row = BinaryRow(values, fields, RowKind.INSERT)
+ serialized_bytes = BinaryRowSerializer.to_bytes(binary_row)
+
+ # Verify the last 6 bytes are 0
+ # The 42-byte string is padded up to the next multiple of 8 bytes (48),
+ # so the serialized row must end with 6 zero padding bytes
+ self.assertEqual(serialized_bytes[-6:], b'\x00\x00\x00\x00\x00\x00')
+ self.assertEqual(serialized_bytes[20:62].decode('utf-8'), long_string)
+ # Deserialize to verify
+ deserialized_row = BinaryRowDeserializer.from_bytes(serialized_bytes,
fields)
+
+ self.assertEqual(deserialized_row.values[0], long_string)
+ self.assertEqual(deserialized_row.row_kind, RowKind.INSERT)
diff --git a/paimon-python/pypaimon/tests/reader_basic_test.py
b/paimon-python/pypaimon/tests/reader_basic_test.py
index 445a65763f..9402ae93f7 100644
--- a/paimon-python/pypaimon/tests/reader_basic_test.py
+++ b/paimon-python/pypaimon/tests/reader_basic_test.py
@@ -23,6 +23,11 @@ import unittest
import pandas as pd
import pyarrow as pa
+from pypaimon.table.row.row_kind import RowKind
+
+from pypaimon.table.row.binary_row import BinaryRow, BinaryRowSerializer,
BinaryRowDeserializer
+
+from pypaimon.schema.data_types import DataField, AtomicType
from pypaimon.catalog.catalog_factory import CatalogFactory
from pypaimon.schema.schema import Schema
@@ -192,3 +197,28 @@ class ReaderBasicTest(unittest.TestCase):
actual = duckdb_con.query("SELECT * FROM duckdb_table").fetchdf()
expect = pd.DataFrame(self.raw_data)
pd.testing.assert_frame_equal(actual.reset_index(drop=True),
expect.reset_index(drop=True))
+
+ def test_to_bytes_with_long_string(self):
+ """Test serialization of strings longer than 7 bytes which require
variable part storage."""
+ # Create fields with a long string value
+ fields = [
+ DataField(0, "long_string", AtomicType("STRING")),
+ ]
+
+ # String longer than 7 bytes will be stored in variable part
+ long_string = "This is a long string that exceeds 7 bytes"
+ values = [long_string]
+
+ binary_row = BinaryRow(values, fields, RowKind.INSERT)
+ serialized_bytes = BinaryRowSerializer.to_bytes(binary_row)
+
+ # Verify the last 6 bytes are 0
+ # The 42-byte string is padded up to the next multiple of 8 bytes (48),
+ # so the serialized row must end with 6 zero padding bytes
+ self.assertEqual(serialized_bytes[-6:], b'\x00\x00\x00\x00\x00\x00')
+ self.assertEqual(serialized_bytes[20:62].decode('utf-8'), long_string)
+ # Deserialize to verify
+ deserialized_row = BinaryRowDeserializer.from_bytes(serialized_bytes,
fields)
+
+ self.assertEqual(deserialized_row.values[0], long_string)
+ self.assertEqual(deserialized_row.row_kind, RowKind.INSERT)