This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 7bddb8338a GH-50012: [Python] Fix list_ storage crashes when values
exceed int32 offsets (#50016)
7bddb8338a is described below
commit 7bddb8338a2f7da7c9b274c937a45c6bead4ac72
Author: AnkitAhlawat <[email protected]>
AuthorDate: Wed Jun 3 05:42:21 2026 +0530
GH-50012: [Python] Fix list_ storage crashes when values exceed int32
offsets (#50016)
### Rationale for this change
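When list data exceeds the int32 offset limit, the converted storage comes back as a ChunkedArray, which `ExtensionArray.from_storage()` does not accept. Each chunk must be wrapped as an ExtensionArray instead.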
### What changes are included in this PR?
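`pa.array()` now wraps the converted storage with `extension_type.wrap_array()` instead of `ExtensionArray.from_storage()`, so the extension type is applied to both Array and ChunkedArray storage.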
### Are these changes tested?
Yes. The changes were tested manually, and two tests are added to python/pyarrow/tests/test_extension_type.py.
### Are there any user-facing changes?
No
### This PR contains a "Critical Fix".
This change fixes a crash with list_ storage. When list data exceeds int32
offset limits, PyArrow automatically creates a ChunkedArray. However,
ExtensionArray.from_storage() only accepts Array objects, not ChunkedArray.
* GitHub Issue: #50012
Authored-by: [email protected] <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
python/pyarrow/array.pxi | 2 +-
python/pyarrow/tests/test_extension_type.py | 72 +++++++++++++++++++++++++++++
2 files changed, 73 insertions(+), 1 deletion(-)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ecdbb342d3..3768c403dd 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -401,7 +401,7 @@ def array(object obj, type=None, mask=None, size=None,
from_pandas=None,
result = _sequence_to_array(obj, mask, size, type, pool,
c_from_pandas)
if extension_type is not None:
- result = ExtensionArray.from_storage(extension_type, result)
+ result = extension_type.wrap_array(result)
return result
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index 465b556876..1adbd4e980 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -2120,3 +2120,75 @@ def test_json(storage_type, pickle_module):
pa.ArrowInvalid,
match=f"Invalid storage type for JsonExtensionType: {storage_type}"):
pa.json_(storage_type)
+
+
+class ListExtensionType(pa.ExtensionType):
+ """Extension type with a list field for testing int32 overflow."""
+
+ def __init__(self):
+ super().__init__(
+ pa.struct({"data": pa.list_(pa.uint8())}),
+ "pyarrow.tests.ListExtensionType",
+ )
+
+ def __arrow_ext_serialize__(self):
+ return b""
+
+ @classmethod
+ def __arrow_ext_deserialize__(cls, storage_type, serialized):
+ return cls()
+
+
[email protected]
+@pytest.mark.large_memory
[email protected]
+def test_extension_type_list_overflow():
+ """
+ Test that extension types with list fields handle int32 offset overflow.
+ """
+ with registered_extension_type(ListExtensionType()):
+ schema = pa.schema({"col": ListExtensionType()})
+
+ # Create data that exceeds int32 max cumulative values
+ # 5 rows × 500M values = 2.5B > int32 max (2,147,483,647)
+ arr = np.zeros(500_000_000, dtype=np.uint8)
+ rows = [{"col": {"data": arr}} for _ in range(5)]
+
+ result = pa.Table.from_pylist(rows, schema=schema)
+
+ assert result.num_rows == 5
+ assert result.num_columns == 1
+ assert result.schema[0].type == ListExtensionType()
+
+ col = result.column(0)
+ assert isinstance(col, pa.ChunkedArray)
+ assert col.type == ListExtensionType()
+
+ assert col.num_chunks > 1, "Expected multiple chunks due to int32 overflow"
+
+ for chunk_idx in range(col.num_chunks):
+ chunk_data = col.chunk(chunk_idx)
+ assert chunk_data.type == ListExtensionType()
+
+
[email protected]
+def test_extension_type_no_overflow():
+ """Test that extension types work normally when there's no overflow."""
+ with registered_extension_type(ListExtensionType()):
+ schema = pa.schema({"col": ListExtensionType()})
+
+ # Small data that won't overflow
+ arr = np.array([1, 2, 3], dtype=np.uint8)
+ rows = [{"col": {"data": arr}} for _ in range(3)]
+
+ result = pa.Table.from_pylist(rows, schema=schema)
+
+ assert result.num_rows == 3
+ assert result.num_columns == 1
+ assert result.schema[0].type == ListExtensionType()
+
+ # The column should be a ChunkedArray with a single chunk
+ col = result.column(0)
+ assert isinstance(col, pa.ChunkedArray)
+ assert col.num_chunks == 1
+ assert col.type == ListExtensionType()