Re: [PR] GH-50012: [Python] Fix list_ storage crashes when values exceed int32 offsets [arrow]

via GitHub Mon, 25 May 2026 13:57:08 -0700


Copilot commented on code in PR #50016:
URL: https://github.com/apache/arrow/pull/50016#discussion_r3299919785



##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -2120,3 +2120,80 @@ def test_json(storage_type, pickle_module):
                 pa.ArrowInvalid,
                 match=f"Invalid storage type for JsonExtensionType: 
{storage_type}"):
             pa.json_(storage_type)
+
+
+class ListExtensionType(pa.ExtensionType):
+    """Extension type with a list field for testing int32 overflow."""
+
+    def __init__(self):
+        super().__init__(
+            pa.struct({"data": pa.list_(pa.uint8())}),
+            "test_list_ext",
+        )
+
+    def __arrow_ext_serialize__(self):
+        return b""
+
+    @classmethod
+    def __arrow_ext_deserialize__(cls, storage_type, serialized):
+        return cls()
+
+
[email protected]_memory
[email protected]
+def test_extension_type_list_overflow():
+    """
+    Test that extension types with list fields handle int32 offset overflow.
+    """
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Create data that exceeds int32 max cumulative values
+    # 5 rows × 500M values = 2.5B > int32 max (2,147,483,647)
+    arr = np.zeros(500_000_000, dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(5)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 5
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)
+    assert col.type == ListExtensionType()
+
+    for chunk_idx in range(col.num_chunks):
+        chunk_data = col.chunk(chunk_idx)
+        assert chunk_data.type == ListExtensionType()
+
+
[email protected]
+def test_extension_type_no_overflow():
+    """Test that extension types work normally when there's no overflow."""
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        # Already registered
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Small data that won't overflow
+    arr = np.array([1, 2, 3], dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(3)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 3
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    # The column should be a ChunkedArray with a single chunk
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)
+    assert col.type == ListExtensionType()

Review Comment:
   `pa.register_extension_type` appears to raise built-in `KeyError` on 
duplicate registration (this file already tests that behavior). Catching 
`pa.ArrowKeyError` here won't prevent failures when the type is already 
registered (e.g., between these two tests). Prefer using the existing 
`registered_extension_type(...)` context manager in this file (it also 
unregisters on teardown), or catch `KeyError` instead.
   



##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -2120,3 +2120,80 @@ def test_json(storage_type, pickle_module):
                 pa.ArrowInvalid,
                 match=f"Invalid storage type for JsonExtensionType: 
{storage_type}"):
             pa.json_(storage_type)
+
+
+class ListExtensionType(pa.ExtensionType):
+    """Extension type with a list field for testing int32 overflow."""
+
+    def __init__(self):
+        super().__init__(
+            pa.struct({"data": pa.list_(pa.uint8())}),
+            "test_list_ext",
+        )
+
+    def __arrow_ext_serialize__(self):
+        return b""
+
+    @classmethod
+    def __arrow_ext_deserialize__(cls, storage_type, serialized):
+        return cls()
+
+
[email protected]_memory
[email protected]
+def test_extension_type_list_overflow():
+    """
+    Test that extension types with list fields handle int32 offset overflow.
+    """
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Create data that exceeds int32 max cumulative values
+    # 5 rows × 500M values = 2.5B > int32 max (2,147,483,647)
+    arr = np.zeros(500_000_000, dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(5)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 5
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)
+    assert col.type == ListExtensionType()
+
+    for chunk_idx in range(col.num_chunks):
+        chunk_data = col.chunk(chunk_idx)
+        assert chunk_data.type == ListExtensionType()
+
+
[email protected]
+def test_extension_type_no_overflow():
+    """Test that extension types work normally when there's no overflow."""
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        # Already registered
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Small data that won't overflow
+    arr = np.array([1, 2, 3], dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(3)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 3
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    # The column should be a ChunkedArray with a single chunk
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)
+    assert col.type == ListExtensionType()

Review Comment:
   This test registers a new extension type but never unregisters it, which can 
leak global state across the test suite. Since this file already provides 
`registered_extension_type(ext_type)` (with unregister in `finally`), consider 
using it here to ensure cleanup even on failure.
   



##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -2120,3 +2120,80 @@ def test_json(storage_type, pickle_module):
                 pa.ArrowInvalid,
                 match=f"Invalid storage type for JsonExtensionType: 
{storage_type}"):
             pa.json_(storage_type)
+
+
+class ListExtensionType(pa.ExtensionType):
+    """Extension type with a list field for testing int32 overflow."""
+
+    def __init__(self):
+        super().__init__(
+            pa.struct({"data": pa.list_(pa.uint8())}),
+            "test_list_ext",
+        )
+
+    def __arrow_ext_serialize__(self):
+        return b""
+
+    @classmethod
+    def __arrow_ext_deserialize__(cls, storage_type, serialized):
+        return cls()
+
+
[email protected]_memory
[email protected]
+def test_extension_type_list_overflow():
+    """
+    Test that extension types with list fields handle int32 offset overflow.
+    """
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Create data that exceeds int32 max cumulative values
+    # 5 rows × 500M values = 2.5B > int32 max (2,147,483,647)
+    arr = np.zeros(500_000_000, dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(5)]

Review Comment:
   Given this test allocates/constructs data on the order of multiple 
gigabytes, it likely belongs under the same `@pytest.mark.slow` marker used by 
other `large_memory` overflow tests in the suite. Without it, the test may 
unexpectedly run in CI configurations that include `large_memory` but exclude 
`slow`.



##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -2120,3 +2120,80 @@ def test_json(storage_type, pickle_module):
                 pa.ArrowInvalid,
                 match=f"Invalid storage type for JsonExtensionType: 
{storage_type}"):
             pa.json_(storage_type)
+
+
+class ListExtensionType(pa.ExtensionType):
+    """Extension type with a list field for testing int32 overflow."""
+
+    def __init__(self):
+        super().__init__(
+            pa.struct({"data": pa.list_(pa.uint8())}),
+            "test_list_ext",
+        )
+
+    def __arrow_ext_serialize__(self):
+        return b""
+
+    @classmethod
+    def __arrow_ext_deserialize__(cls, storage_type, serialized):
+        return cls()
+
+
[email protected]_memory
[email protected]
+def test_extension_type_list_overflow():
+    """
+    Test that extension types with list fields handle int32 offset overflow.
+    """
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Create data that exceeds int32 max cumulative values
+    # 5 rows × 500M values = 2.5B > int32 max (2,147,483,647)
+    arr = np.zeros(500_000_000, dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(5)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 5
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)
+    assert col.type == ListExtensionType()
+
+    for chunk_idx in range(col.num_chunks):
+        chunk_data = col.chunk(chunk_idx)
+        assert chunk_data.type == ListExtensionType()

Review Comment:
   The test intends to cover the int32-overflow auto-chunking path, but it 
never asserts that chunking actually happened (e.g., `col.num_chunks > 1`). 
Adding an assertion on the number of chunks would ensure the test fails if the 
overflow condition isn't triggered (or if the code regresses to producing a 
single chunk).



##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -2120,3 +2120,80 @@ def test_json(storage_type, pickle_module):
                 pa.ArrowInvalid,
                 match=f"Invalid storage type for JsonExtensionType: 
{storage_type}"):
             pa.json_(storage_type)
+
+
+class ListExtensionType(pa.ExtensionType):
+    """Extension type with a list field for testing int32 overflow."""
+
+    def __init__(self):
+        super().__init__(
+            pa.struct({"data": pa.list_(pa.uint8())}),
+            "test_list_ext",
+        )
+
+    def __arrow_ext_serialize__(self):
+        return b""
+
+    @classmethod
+    def __arrow_ext_deserialize__(cls, storage_type, serialized):
+        return cls()
+
+
[email protected]_memory
[email protected]
+def test_extension_type_list_overflow():
+    """
+    Test that extension types with list fields handle int32 offset overflow.
+    """
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Create data that exceeds int32 max cumulative values
+    # 5 rows × 500M values = 2.5B > int32 max (2,147,483,647)
+    arr = np.zeros(500_000_000, dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(5)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 5
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)
+    assert col.type == ListExtensionType()
+
+    for chunk_idx in range(col.num_chunks):
+        chunk_data = col.chunk(chunk_idx)
+        assert chunk_data.type == ListExtensionType()
+
+
[email protected]
+def test_extension_type_no_overflow():
+    """Test that extension types work normally when there's no overflow."""
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        # Already registered
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Small data that won't overflow
+    arr = np.array([1, 2, 3], dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(3)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 3
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    # The column should be a ChunkedArray with a single chunk
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)

Review Comment:
   The comment says the column should have a single chunk, but the test doesn't 
assert `col.num_chunks == 1`. Adding that assertion would make the expectation 
explicit and prevent silent behavior changes.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-50012: [Python] Fix list_ storage crashes when values exceed int32 offsets [arrow]

Reply via email to