This is an automated email from the ASF dual-hosted git repository.
alenka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 008e082cbc GH-49002: [Python] Fix array.to_pandas string type
conversion for arrays with None (#49247)
008e082cbc is described below
commit 008e082cbc08052c9215660a918c748b359281e5
Author: Alenka Frim <[email protected]>
AuthorDate: Wed Apr 1 16:01:07 2026 +0200
GH-49002: [Python] Fix array.to_pandas string type conversion for arrays
with None (#49247)
### Rationale for this change
The conversion from an array with string type to a pandas Series, when the
array contains only `None`, was still taking the old code path even with
pandas 3.0.
### What changes are included in this PR?
Always check `dtype` in the `_array_like_to_pandas` conversion and use
pandas' new default string `dtype` if available.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No, only a bug fix.
* GitHub Issue: #49002
Lead-authored-by: Alenka Frim <[email protected]>
Co-authored-by: AlenkaF <[email protected]>
Co-authored-by: Raúl Cumplido <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
---
python/pyarrow/array.pxi | 7 ++++++
python/pyarrow/tests/test_pandas.py | 48 ++++++++++++++++++++++++++++++++++---
2 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 76f8fec337..b7f3a46f9e 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -2349,6 +2349,13 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
dtype = original_type.to_pandas_dtype()
except NotImplementedError:
pass
+ elif pandas_api.uses_string_dtype() and not options["strings_to_categorical"] and (
+ original_type.id == _Type_STRING or
+ original_type.id == _Type_LARGE_STRING or
+ original_type.id == _Type_STRING_VIEW
+ ):
+ # for pandas 3.0+, use pandas' new default string dtype
+ dtype = pandas_api.pd.StringDtype(na_value=np.nan)
# Only call __from_arrow__ for Arrow extension types or when explicitly
# overridden via types_mapper
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 5fde980dd8..0339975f45 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2975,7 +2975,9 @@ class TestZeroCopyConversion:
arr.to_pandas(zero_copy_only=True)
def test_zero_copy_failure_on_object_types(self):
- self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))
+ if Version(pd.__version__) < Version("3.0.0"):
+ # pandas 3.0 includes default string dtype support
+ self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))
def test_zero_copy_failure_with_int_when_nulls(self):
self.check_zero_copy_failure(pa.array([0, 1, None]))
@@ -3047,6 +3049,10 @@ class TestConvertMisc:
def test_empty_arrays(self):
for dtype_str, pa_type in self.type_pairs:
+ if (Version(pd.__version__) >= Version("3.0.0") and
+ pa_type == pa.string()):
+ # PyArrow backed string dtype are set by default
+ dtype_str = 'str'
arr = np.array([], dtype=np.dtype(dtype_str))
_check_array_roundtrip(arr, type=pa_type)
@@ -3231,13 +3237,19 @@ class TestConvertMisc:
empty_objects = pd.Series(np.array([], dtype=object))
tm.assert_series_equal(arr.to_pandas(),
pd.Series(np.array([], dtype=np.int64)))
- arr = pa.array([], type=pa.string())
- tm.assert_series_equal(arr.to_pandas(), empty_objects)
arr = pa.array([], type=pa.list_(pa.int64()))
tm.assert_series_equal(arr.to_pandas(), empty_objects)
arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())]))
tm.assert_series_equal(arr.to_pandas(), empty_objects)
+ arr = pa.array([], type=pa.string())
+ if Version(pd.__version__) >= Version("3.0.0"):
+ # PyArrow backed string dtype are set by default
+ empty_str = pd.Series([], dtype=str)
+ tm.assert_series_equal(arr.to_pandas(), empty_str)
+ else:
+ tm.assert_series_equal(arr.to_pandas(), empty_objects)
+
def test_non_natural_stride(self):
"""
ARROW-2172: converting from a Numpy array with a stride that's
@@ -4652,6 +4664,36 @@ def test_chunked_array_to_pandas_types_mapper():
assert result.dtype == np.dtype("int64")
+@pytest.mark.parametrize(
+ "string_type", [pa.string(), pa.large_string(), pa.string_view()]
+)
+@pytest.mark.parametrize("data", [[], [None]])
+def test_array_to_pandas_string_dtype(string_type, data):
+ # GH-49002
+ if Version(pd.__version__) < Version("3.0.0"):
+ pytest.skip("PyArrow backed string dtype missing")
+
+ arr = pa.array(data, type=string_type)
+ result = arr.to_pandas()
+ assert result.dtype == pd.StringDtype(na_value=np.nan)
+
+ arr = pa.chunked_array([data], type=string_type)
+ result = arr.to_pandas()
+ assert result.dtype == pd.StringDtype(na_value=np.nan)
+
+ # Test types_mapper takes precedence
+ types_mapper = {string_type: None}.get
+ result = arr.to_pandas(types_mapper=types_mapper)
+ assert result.dtype == np.dtype("object")
+
+ # Test strings_to_categorical
+ result = arr.to_pandas(strings_to_categorical=False)
+ assert result.dtype == pd.StringDtype(na_value=np.nan)
+ result = arr.to_pandas(strings_to_categorical=True)
+ assert result.dtype == pd.CategoricalDtype(categories=[],
+ ordered=False)
+
+
# ----------------------------------------------------------------------
# Legacy metadata compatibility tests