This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 5181c24f6d GH-43683: [Python] Use pandas StringDtype when enabled (pandas 3+) (#44195)
5181c24f6d is described below
commit 5181c24f6d780591426e0af0d1b72c68f190484c
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Thu Jan 9 20:22:01 2025 +0100
GH-43683: [Python] Use pandas StringDtype when enabled (pandas 3+) (#44195)
### Rationale for this change
With pandas'
[PDEP-14](https://pandas.pydata.org/pdeps/0014-string-dtype.html) proposal,
pandas plans to introduce a dedicated string dtype (instead of the current
object dtype). This will become the default in pandas 3.0 and can already be
enabled with an option in the upcoming pandas 2.3
(`pd.options.future.infer_string = True`). To prepare for that, we should
start using that string dtype in `to_pandas()` conversions whenever the
option is enabled.
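As an illustration, a minimal sketch of the intended behaviour (assuming a
pandas version where the option exists, e.g. pandas 2.3+, or pandas 3.0 where
it becomes the default):

```python
import pandas as pd
import pyarrow as pa

# opt in on pandas 2.x; on pandas >= 3.0 this is already the default
pd.options.future.infer_string = True

table = pa.table({"col": ["a", "b", None]})
df = table.to_pandas()
# with this change the column comes back with pandas' default string dtype
# (e.g. "str") instead of object dtype
print(df["col"].dtype)
```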
### What changes are included in this PR?
- If pandas >= 3.0 is used or the pandas option is enabled, ensure that
`to_pandas()` uses pandas' default string dtype for string-like columns
(string, large_string, string_view), as sketched below
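A hedged sketch of that mapping (column names are arbitrary;
`pa.string_view()` requires a recent pyarrow):

```python
import pyarrow as pa

table = pa.table({
    "s": pa.array(["x", None], type=pa.string()),
    "ls": pa.array(["x", None], type=pa.large_string()),
    "sv": pa.array(["x", None], type=pa.string_view()),
})
# with pandas >= 3 (or the option enabled), all three columns are expected
# to use pandas' default StringDtype, with NaN as the missing-value marker
print(table.to_pandas().dtypes)
```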
### Are these changes tested?
These changes are tested in the pandas-nightly crossbow build.
There is still one remaining failure, caused by a bug on the pandas side
(https://github.com/pandas-dev/pandas/issues/59879).
### Are there any user-facing changes?
**This PR includes breaking changes to public APIs.** Depending on the
pandas version, `to_pandas()` will use pandas' string dtype instead of
object dtype for string-like columns. This is a breaking user-facing change,
but it simply follows the equivalent change of the default dtype on the
pandas side.
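For downstream code that still relies on object-dtype string columns, one
possible (hedged) way to restore the previous result, mirroring the pattern
used in the updated tests below:

```python
import numpy as np
import pyarrow as pa

df = pa.table({"col": ["a", "b", None]}).to_pandas()
# cast back to object dtype and restore None as the missing-value marker
df["col"] = df["col"].astype(object).replace({np.nan: None})
```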
* GitHub Issue: #43683
Lead-authored-by: Joris Van den Bossche <[email protected]>
Co-authored-by: Raúl Cumplido <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
dev/tasks/tasks.yml | 6 +++
docker-compose.yml | 1 +
python/pyarrow/array.pxi | 2 +
python/pyarrow/pandas-shim.pxi | 17 +++++++-
python/pyarrow/pandas_compat.py | 62 +++++++++++++++++++++++++----
python/pyarrow/tests/test_compute.py | 19 ++++-----
python/pyarrow/tests/test_feather.py | 6 ++-
python/pyarrow/tests/test_pandas.py | 77 ++++++++++++++++++++++++++++--------
8 files changed, 155 insertions(+), 35 deletions(-)
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index f5b5ab53eb..93450df155 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -1426,6 +1426,12 @@ tasks:
# ensure we have at least one build with parquet encryption disabled
PARQUET_REQUIRE_ENCRYPTION: "OFF"
{% endif %}
+ {% if pandas_version == "nightly" %}
+ # TODO can be removed once this is enabled by default in pandas >= 3
+ # This is to enable the Pandas feature.
+ # See: https://github.com/pandas-dev/pandas/pull/58459
+ PANDAS_FUTURE_INFER_STRING: "1"
+ {% endif %}
{% if not cache_leaf %}
# use the latest pandas release, so prevent reusing any cached layers
flags: --no-leaf-cache
diff --git a/docker-compose.yml b/docker-compose.yml
index 5cb96e62c1..43dd3511fc 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1375,6 +1375,7 @@ services:
PYTEST_ARGS: # inherit
HYPOTHESIS_PROFILE: # inherit
PYARROW_TEST_HYPOTHESIS: # inherit
+ PANDAS_FUTURE_INFER_STRING: # inherit
volumes: *conda-volumes
command: *python-conda-command
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index f86caf1433..2ef42051d9 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -117,6 +117,8 @@ def _handle_arrow_array_protocol(obj, type, mask, size):
"return a pyarrow Array or ChunkedArray.")
if isinstance(res, ChunkedArray) and res.num_chunks == 1:
res = res.chunk(0)
+ if type is not None and res.type != type:
+ res = res.cast(type)
return res
diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi
index 74f0d981b5..5be6f03f86 100644
--- a/python/pyarrow/pandas-shim.pxi
+++ b/python/pyarrow/pandas-shim.pxi
@@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object):
object _array_like_types, _is_extension_array_dtype, _lock
bint has_sparse
bint _pd024
- bint _is_v1, _is_ge_v21, _is_ge_v3
+ bint _is_v1, _is_ge_v21, _is_ge_v3, _is_ge_v3_strict
def __init__(self):
self._lock = Lock()
@@ -80,6 +80,7 @@ cdef class _PandasAPIShim(object):
self._is_v1 = self._loose_version < Version('2.0.0')
self._is_ge_v21 = self._loose_version >= Version('2.1.0')
self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0')
+ self._is_ge_v3_strict = self._loose_version >= Version('3.0.0')
self._compat_module = pdcompat
self._data_frame = pd.DataFrame
@@ -174,6 +175,20 @@ cdef class _PandasAPIShim(object):
self._check_import()
return self._is_ge_v3
+ def is_ge_v3_strict(self):
+ self._check_import()
+ return self._is_ge_v3_strict
+
+ def uses_string_dtype(self):
+ if self.is_ge_v3_strict():
+ return True
+ try:
+ if self.pd.options.future.infer_string:
+ return True
+ except:
+ pass
+ return False
+
@property
def categorical_type(self):
self._check_import()
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index d0582f825b..e9655914ad 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -174,7 +174,11 @@ def get_column_metadata(column, name, arrow_type, field_name):
}
string_dtype = 'object'
- if name is not None and not isinstance(name, str):
+ if (
+ name is not None
+ and not (isinstance(name, float) and np.isnan(name))
+ and not isinstance(name, str)
+ ):
raise TypeError(
'Column name must be a string. Got column {} of type {}'.format(
name, type(name).__name__
@@ -340,8 +344,8 @@ def _column_name_to_strings(name):
return str(tuple(map(_column_name_to_strings, name)))
elif isinstance(name, Sequence):
raise TypeError("Unsupported type for MultiIndex level")
- elif name is None:
- return None
+ elif name is None or (isinstance(name, float) and np.isnan(name)):
+ return name
return str(name)
@@ -790,10 +794,12 @@ def table_to_dataframe(
table, index = _reconstruct_index(table, index_descriptors,
all_columns, types_mapper)
ext_columns_dtypes = _get_extension_dtypes(
- table, all_columns, types_mapper)
+ table, all_columns, types_mapper, options, categories)
else:
index = _pandas_api.pd.RangeIndex(table.num_rows)
- ext_columns_dtypes = _get_extension_dtypes(table, [], types_mapper)
+ ext_columns_dtypes = _get_extension_dtypes(
+ table, [], types_mapper, options, categories
+ )
_check_data_column_metadata_consistency(all_columns)
columns = _deserialize_column_index(table, all_columns, column_indexes)
@@ -838,7 +844,7 @@ _pandas_supported_numpy_types = {
}
-def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
+def _get_extension_dtypes(table, columns_metadata, types_mapper, options, categories):
"""
Based on the stored column pandas metadata and the extension types
in the arrow schema, infer which columns should be converted to a
@@ -851,6 +857,9 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
and then we can check if this dtype supports conversion from arrow.
"""
+ strings_to_categorical = options["strings_to_categorical"]
+ categories = categories or []
+
ext_columns = {}
# older pandas version that does not yet support extension dtypes
@@ -889,9 +898,32 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
# that are certainly numpy dtypes
pandas_dtype = _pandas_api.pandas_dtype(dtype)
if isinstance(pandas_dtype, _pandas_api.extension_dtype):
+ if isinstance(pandas_dtype, _pandas_api.pd.StringDtype):
+ # when the metadata indicate to use the string dtype,
+ # ignore this in case:
+ # - it is specified to convert strings / this column to categorical
+ # - the column itself is dictionary encoded and would otherwise be
+ # converted to categorical
+ if strings_to_categorical or name in categories:
+ continue
+ try:
+ if pa.types.is_dictionary(table.schema.field(name).type):
+ continue
+ except KeyError:
+ pass
if hasattr(pandas_dtype, "__from_arrow__"):
ext_columns[name] = pandas_dtype
+ # for pandas 3.0+, use pandas' new default string dtype
+ if _pandas_api.uses_string_dtype() and not strings_to_categorical:
+ for field in table.schema:
+ if field.name not in ext_columns and (
+ pa.types.is_string(field.type)
+ or pa.types.is_large_string(field.type)
+ or pa.types.is_string_view(field.type)
+ ) and field.name not in categories:
+ ext_columns[field.name] = _pandas_api.pd.StringDtype(na_value=np.nan)
+
return ext_columns
@@ -1049,9 +1081,9 @@ def get_pandas_logical_type_map():
'date': 'datetime64[D]',
'datetime': 'datetime64[ns]',
'datetimetz': 'datetime64[ns]',
- 'unicode': np.str_,
+ 'unicode': 'str',
'bytes': np.bytes_,
- 'string': np.str_,
+ 'string': 'str',
'integer': np.int64,
'floating': np.float64,
'decimal': np.object_,
@@ -1142,6 +1174,20 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
# GH-41503: if the column index was decimal, restore to decimal
elif pandas_dtype == "decimal":
level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level])
+ elif (
+ level.dtype == "str" and numpy_dtype == "object"
+ and ("mixed" in pandas_dtype or pandas_dtype in ["unicode",
"string"])
+ ):
+ # the metadata indicate that the original dataframe used object dtype,
+ # but ignore this and keep string dtype if:
+ # - the original columns used mixed types -> we don't attempt to faithfully
+ # roundtrip in this case, but keep the column names as strings
+ # - the original columns were inferred to be strings but stored in object
+ # dtype -> we don't restore the object dtype because all metadata
+ # generated using pandas < 3 will have this case by default, and
+ # for pandas >= 3 we want to use the default string dtype for .columns
+ new_levels.append(level)
+ continue
elif level.dtype != dtype:
level = level.astype(dtype)
# ARROW-9096: if original DataFrame was upcast we keep that
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index e6fcd6149e..6f28205a18 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1020,7 +1020,7 @@ def test_replace_slice():
offsets = range(-3, 4)
arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde'])
- series = arr.to_pandas()
+ series = arr.to_pandas().astype(object).replace({np.nan: None})
for start in offsets:
for stop in offsets:
expected = series.str.slice_replace(start, stop, 'XX')
@@ -1031,7 +1031,7 @@ def test_replace_slice():
assert pc.binary_replace_slice(arr, start, stop, 'XX') == actual
arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde'])
- series = arr.to_pandas()
+ series = arr.to_pandas().astype(object).replace({np.nan: None})
for start in offsets:
for stop in offsets:
expected = series.str.slice_replace(start, stop, 'XX')
@@ -2132,7 +2132,8 @@ def test_strftime():
for fmt in formats:
options = pc.StrftimeOptions(fmt)
result = pc.strftime(tsa, options=options)
- expected = pa.array(ts.strftime(fmt))
+ # cast to the same type as result to ignore string vs large_string
+ expected = pa.array(ts.strftime(fmt)).cast(result.type)
assert result.equals(expected)
fmt = "%Y-%m-%dT%H:%M:%S"
@@ -2140,34 +2141,34 @@ def test_strftime():
# Default format
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
result = pc.strftime(tsa, options=pc.StrftimeOptions())
- expected = pa.array(ts.strftime(fmt))
+ expected = pa.array(ts.strftime(fmt)).cast(result.type)
assert result.equals(expected)
# Default format plus timezone
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
- expected = pa.array(ts.strftime(fmt + "%Z"))
+ expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type)
assert result.equals(expected)
# Pandas %S is equivalent to %S in arrow for unit="s"
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
options = pc.StrftimeOptions("%S")
result = pc.strftime(tsa, options=options)
- expected = pa.array(ts.strftime("%S"))
+ expected = pa.array(ts.strftime("%S")).cast(result.type)
assert result.equals(expected)
# Pandas %S.%f is equivalent to %S in arrow for unit="us"
tsa = pa.array(ts, type=pa.timestamp("us", timezone))
options = pc.StrftimeOptions("%S")
result = pc.strftime(tsa, options=options)
- expected = pa.array(ts.strftime("%S.%f"))
+ expected = pa.array(ts.strftime("%S.%f")).cast(result.type)
assert result.equals(expected)
# Test setting locale
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
options = pc.StrftimeOptions(fmt, locale="C")
result = pc.strftime(tsa, options=options)
- expected = pa.array(ts.strftime(fmt))
+ expected = pa.array(ts.strftime(fmt)).cast(result.type)
assert result.equals(expected)
# Test timestamps without timezone
@@ -2175,7 +2176,7 @@ def test_strftime():
ts = pd.to_datetime(times)
tsa = pa.array(ts, type=pa.timestamp("s"))
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
- expected = pa.array(ts.strftime(fmt))
+ expected = pa.array(ts.strftime(fmt)).cast(result.type)
# Positional format
assert pc.strftime(tsa, fmt) == result
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 18c8cd5b65..249fb62127 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -426,7 +426,11 @@ def test_empty_strings(version):
@pytest.mark.pandas
def test_all_none(version):
df = pd.DataFrame({'all_none': [None] * 10})
- _check_pandas_roundtrip(df, version=version)
+ if version == 1 and pa.pandas_compat._pandas_api.uses_string_dtype():
+ expected = df.astype("str")
+ else:
+ expected = df
+ _check_pandas_roundtrip(df, version=version, expected=expected)
@pytest.mark.pandas
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index d5c936df07..f356874c57 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -349,6 +349,17 @@ class TestConvertMetadata:
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
_check_pandas_roundtrip(df, preserve_index=True)
+ def test_float_column_index_with_missing(self):
+ df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=[1.5, np.nan])
+ _check_pandas_roundtrip(df, preserve_index=True)
+
+ @pytest.mark.filterwarnings(
+ "ignore:The DataFrame has column names of mixed type:UserWarning"
+ )
+ def test_string_column_index_with_missing(self):
+ df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=["A", None])
+ _check_pandas_roundtrip(df, preserve_index=True)
+
def test_index_metadata_field_name(self):
# test None case, and strangely named non-index columns
df = pd.DataFrame(
@@ -359,8 +370,11 @@ class TestConvertMetadata:
),
columns=['a', None, '__index_level_0__'],
)
- with pytest.warns(UserWarning):
+ if _pandas_api.uses_string_dtype():
t = pa.Table.from_pandas(df, preserve_index=True)
+ else:
+ with pytest.warns(UserWarning):
+ t = pa.Table.from_pandas(df, preserve_index=True)
js = t.schema.pandas_metadata
col1, col2, col3, idx0, foo = js['columns']
@@ -368,8 +382,12 @@ class TestConvertMetadata:
assert col1['name'] == 'a'
assert col1['name'] == col1['field_name']
- assert col2['name'] is None
- assert col2['field_name'] == 'None'
+ if _pandas_api.uses_string_dtype():
+ assert np.isnan(col2['name'])
+ assert col2['field_name'] == 'nan'
+ else:
+ assert col2['name'] is None
+ assert col2['field_name'] == 'None'
assert col3['name'] == '__index_level_0__'
assert col3['name'] == col3['field_name']
@@ -411,7 +429,9 @@ class TestConvertMetadata:
column_indexes, = js['column_indexes']
assert column_indexes['name'] == 'stringz'
assert column_indexes['name'] == column_indexes['field_name']
- assert column_indexes['numpy_type'] == 'object'
+ assert column_indexes['numpy_type'] == (
+ 'str' if _pandas_api.uses_string_dtype() else 'object'
+ )
assert column_indexes['pandas_type'] == 'unicode'
md = column_indexes['metadata']
@@ -1680,7 +1700,10 @@ class TestConvertStringLikeTypes:
repeats = 1000
values = ['foo', None, 'bar', 'mañana', np.nan]
df = pd.DataFrame({'strings': values * repeats})
- field = pa.field('strings', pa.string())
+ field = pa.field(
+ 'strings',
+ pa.large_string() if _pandas_api.uses_string_dtype() else pa.string()
+ )
schema = pa.schema([field])
ex_values = ['foo', None, 'bar', 'mañana', None]
expected = pd.DataFrame({'strings': ex_values * repeats})
@@ -3323,6 +3346,10 @@ def _assert_nunique(obj, expected):
def test_to_pandas_deduplicate_strings_array_types():
+ if _pandas_api.uses_string_dtype():
+ pytest.skip(
+ "pandas uses string dtype and not object dtype, keyword has no
effect"
+ )
nunique = 100
repeats = 10
values = _generate_dedup_example(nunique, repeats)
@@ -3335,6 +3362,10 @@ def test_to_pandas_deduplicate_strings_array_types():
def test_to_pandas_deduplicate_strings_table_types():
+ if _pandas_api.uses_string_dtype():
+ pytest.skip(
+ "pandas uses string dtype and not object dtype, keyword has no
effect"
+ )
nunique = 100
repeats = 10
values = _generate_dedup_example(nunique, repeats)
@@ -3798,20 +3829,26 @@ def _check_to_pandas_memory_unchanged(obj, **kwargs):
x = obj.to_pandas(**kwargs) # noqa
# Memory allocation unchanged -- either zero copy or self-destructing
- assert pa.total_allocated_bytes() == prior_allocation
+ if _pandas_api.uses_string_dtype():
+ # for the string array of the columns Index
+ # -> increase the size to account for overallocation for small arrays
+ max_index_allocation = max(192, x.columns.nbytes * 2)
+ assert pa.total_allocated_bytes() <= (prior_allocation + max_index_allocation)
+ else:
+ assert pa.total_allocated_bytes() == prior_allocation
def test_to_pandas_split_blocks():
# ARROW-3789
t = pa.table([
- pa.array([1, 2, 3, 4, 5], type='i1'),
- pa.array([1, 2, 3, 4, 5], type='i4'),
- pa.array([1, 2, 3, 4, 5], type='i8'),
- pa.array([1, 2, 3, 4, 5], type='f4'),
- pa.array([1, 2, 3, 4, 5], type='f8'),
- pa.array([1, 2, 3, 4, 5], type='f8'),
- pa.array([1, 2, 3, 4, 5], type='f8'),
- pa.array([1, 2, 3, 4, 5], type='f8'),
+ pa.array([1, 2, 3, 4, 5]*100, type='i1'),
+ pa.array([1, 2, 3, 4, 5]*100, type='i4'),
+ pa.array([1, 2, 3, 4, 5]*100, type='i8'),
+ pa.array([1, 2, 3, 4, 5]*100, type='f4'),
+ pa.array([1, 2, 3, 4, 5]*100, type='f8'),
+ pa.array([1, 2, 3, 4, 5]*100, type='f8'),
+ pa.array([1, 2, 3, 4, 5]*100, type='f8'),
+ pa.array([1, 2, 3, 4, 5]*100, type='f8'),
], ['f{}'.format(i) for i in range(8)])
_check_blocks_created(t, 8)
@@ -3856,7 +3893,12 @@ def test_table_uses_memory_pool():
prior_allocation = pa.total_allocated_bytes()
x = t.to_pandas()
- assert pa.total_allocated_bytes() == (prior_allocation + 3 * N * 8)
+ new_allocation = 3 * N * 8
+ if _pandas_api.uses_string_dtype():
+ # for the small columns Index
+ new_allocation += 128
+
+ assert pa.total_allocated_bytes() == (prior_allocation + new_allocation)
# Check successful garbage collection
x = None # noqa
@@ -4134,7 +4176,10 @@ def test_dictionary_encoded_nested_to_pandas():
def test_dictionary_from_pandas():
cat = pd.Categorical(['a', 'b', 'a'])
- expected_type = pa.dictionary(pa.int8(), pa.string())
+ expected_type = pa.dictionary(
+ pa.int8(),
+ pa.large_string() if _pandas_api.uses_string_dtype() else pa.string()
+ )
result = pa.array(cat)
assert result.to_pylist() == ['a', 'b', 'a']