This is an automated email from the ASF dual-hosted git repository.

alenka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 008e082cbc GH-49002: [Python] Fix array.to_pandas string type 
conversion for arrays with None (#49247)
008e082cbc is described below

commit 008e082cbc08052c9215660a918c748b359281e5
Author: Alenka Frim <[email protected]>
AuthorDate: Wed Apr 1 16:01:07 2026 +0200

    GH-49002: [Python] Fix array.to_pandas string type conversion for arrays 
with None (#49247)
    
    ### Rationale for this change
    
    Converting an array with a string type to a pandas Series was still taking the old code path, even with pandas 3.0, when the array contained only a `None` element.
    
    ### What changes are included in this PR?
    
    Always check `dtype` in the `_array_like_to_pandas` conversion and use pandas' new default string `dtype` if available.
    
    ### Are these changes tested?
    Yes.
    
    ### Are there any user-facing changes?
    No, only bug fix.
    * GitHub Issue: #49002
    
    Lead-authored-by: Alenka Frim <[email protected]>
    Co-authored-by: AlenkaF <[email protected]>
    Co-authored-by: Raúl Cumplido <[email protected]>
    Co-authored-by: Joris Van den Bossche <[email protected]>
    Signed-off-by: AlenkaF <[email protected]>
---
 python/pyarrow/array.pxi            |  7 ++++++
 python/pyarrow/tests/test_pandas.py | 48 ++++++++++++++++++++++++++++++++++---
 2 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 76f8fec337..b7f3a46f9e 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -2349,6 +2349,13 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
             dtype = original_type.to_pandas_dtype()
         except NotImplementedError:
             pass
+    elif pandas_api.uses_string_dtype() and not options["strings_to_categorical"] and (
+        original_type.id == _Type_STRING or
+        original_type.id == _Type_LARGE_STRING or
+        original_type.id == _Type_STRING_VIEW
+    ):
+        # for pandas 3.0+, use pandas' new default string dtype
+        dtype = pandas_api.pd.StringDtype(na_value=np.nan)
 
     # Only call __from_arrow__ for Arrow extension types or when explicitly
     # overridden via types_mapper
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 5fde980dd8..0339975f45 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2975,7 +2975,9 @@ class TestZeroCopyConversion:
             arr.to_pandas(zero_copy_only=True)
 
     def test_zero_copy_failure_on_object_types(self):
-        self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))
+        if Version(pd.__version__) < Version("3.0.0"):
+            # pandas 3.0 includes default string dtype support
+            self.check_zero_copy_failure(pa.array(['A', 'B', 'C']))
 
     def test_zero_copy_failure_with_int_when_nulls(self):
         self.check_zero_copy_failure(pa.array([0, 1, None]))
@@ -3047,6 +3049,10 @@ class TestConvertMisc:
 
     def test_empty_arrays(self):
         for dtype_str, pa_type in self.type_pairs:
+            if (Version(pd.__version__) >= Version("3.0.0") and
+                    pa_type == pa.string()):
+                # The PyArrow-backed string dtype is the default
+                dtype_str = 'str'
             arr = np.array([], dtype=np.dtype(dtype_str))
             _check_array_roundtrip(arr, type=pa_type)
 
@@ -3231,13 +3237,19 @@ class TestConvertMisc:
         empty_objects = pd.Series(np.array([], dtype=object))
         tm.assert_series_equal(arr.to_pandas(),
                                pd.Series(np.array([], dtype=np.int64)))
-        arr = pa.array([], type=pa.string())
-        tm.assert_series_equal(arr.to_pandas(), empty_objects)
         arr = pa.array([], type=pa.list_(pa.int64()))
         tm.assert_series_equal(arr.to_pandas(), empty_objects)
         arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())]))
         tm.assert_series_equal(arr.to_pandas(), empty_objects)
 
+        arr = pa.array([], type=pa.string())
+        if Version(pd.__version__) >= Version("3.0.0"):
+            # The PyArrow-backed string dtype is the default
+            empty_str = pd.Series([], dtype=str)
+            tm.assert_series_equal(arr.to_pandas(), empty_str)
+        else:
+            tm.assert_series_equal(arr.to_pandas(), empty_objects)
+
     def test_non_natural_stride(self):
         """
         ARROW-2172: converting from a Numpy array with a stride that's
@@ -4652,6 +4664,36 @@ def test_chunked_array_to_pandas_types_mapper():
     assert result.dtype == np.dtype("int64")
 
 
+@pytest.mark.parametrize(
+    "string_type", [pa.string(), pa.large_string(), pa.string_view()]
+)
+@pytest.mark.parametrize("data", [[], [None]])
+def test_array_to_pandas_string_dtype(string_type, data):
+    # GH-49002
+    if Version(pd.__version__) < Version("3.0.0"):
+        pytest.skip("PyArrow backed string dtype missing")
+
+    arr = pa.array(data, type=string_type)
+    result = arr.to_pandas()
+    assert result.dtype == pd.StringDtype(na_value=np.nan)
+
+    arr = pa.chunked_array([data], type=string_type)
+    result = arr.to_pandas()
+    assert result.dtype == pd.StringDtype(na_value=np.nan)
+
+    # Test types_mapper takes precedence
+    types_mapper = {string_type: None}.get
+    result = arr.to_pandas(types_mapper=types_mapper)
+    assert result.dtype == np.dtype("object")
+
+    # Test strings_to_categorical
+    result = arr.to_pandas(strings_to_categorical=False)
+    assert result.dtype == pd.StringDtype(na_value=np.nan)
+    result = arr.to_pandas(strings_to_categorical=True)
+    assert result.dtype == pd.CategoricalDtype(categories=[],
+                                               ordered=False)
+
+
 # ----------------------------------------------------------------------
 # Legacy metadata compatibility tests
 

Reply via email to