This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new a81e6c6015 GH-46179: [Python] Bump index level once if pandas df
already contains __index_level_i__ column (#46884)
a81e6c6015 is described below
commit a81e6c6015db4ab024bfd3a2d8666bde014ba6f1
Author: Alenka Frim <[email protected]>
AuthorDate: Mon Jun 15 10:52:55 2026 +0200
GH-46179: [Python] Bump index level once if pandas df already contains
__index_level_i__ column (#46884)
### Rationale for this change
PyArrow adds an `__index_level_i__` column to the PyArrow table if the pandas
dataframe has an unnamed index it wants to preserve. Currently that creates a
duplicate if such a column already exists.
### What changes are included in this PR?
Bump the integer in the generated index column name until it no longer
collides with an existing column name, so that no duplicates are created.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* GitHub Issue: #46179
Authored-by: AlenkaF <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
---
python/pyarrow/pandas_compat.py | 9 +++++++--
python/pyarrow/tests/test_pandas.py | 38 ++++++++++++++++++++++++++++++++++++-
2 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index d27a95b9f9..d8fd383d31 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -378,7 +378,10 @@ def _index_level_name(index, i, column_names):
if index.name is not None and index.name not in column_names:
return _column_name_to_strings(index.name)
else:
- return f'__index_level_{i:d}__'
+ j = i
+ while f'__index_level_{j:d}__' in column_names:
+ j += 1
+ return f'__index_level_{j:d}__'
def _get_columns_to_convert(df, schema, preserve_index, columns):
@@ -419,7 +422,9 @@ def _get_columns_to_convert(df, schema, preserve_index,
columns):
index_descriptors = []
index_column_names = []
for i, index_level in enumerate(index_levels):
- name = _index_level_name(index_level, i, column_names)
+ name = _index_level_name(
+ index_level, i, column_names + index_column_names
+ )
if (isinstance(index_level, _pandas_api.pd.RangeIndex) and
preserve_index is None):
descr = _get_range_index_descriptor(index_level)
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index 326473b109..c42dc04cd1 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -404,7 +404,8 @@ class TestConvertMetadata:
assert col3['name'] == col3['field_name']
idx0_descr, foo_descr = js['index_columns']
- assert idx0_descr == '__index_level_0__'
+ # __index_level_0__ exists, unnamed bumped to __index_level_1__
+ assert idx0_descr == '__index_level_1__'
assert idx0['field_name'] == idx0_descr
assert idx0['name'] is None
@@ -412,6 +413,41 @@ class TestConvertMetadata:
assert foo['field_name'] == foo_descr
assert foo['name'] == foo_descr
+ def test_index_level_name_bump(self):
+ # GH-46179
+ df = pd.DataFrame(
+ {"col": [1, 2, 3], "__index_level_0__": [4, 5, 6]},
+ index=[10, 20, 30],
+ )
+ _check_pandas_roundtrip(df, preserve_index=True)
+
+ # Explicit test
+ t = pa.table(df)
+ expected_schema = pa.schema([
+ ("col", pa.int64()),
+ ("__index_level_0__", pa.int64()),
+ ("__index_level_1__", pa.int64())
+ ])
+ assert t.schema.equals(expected_schema)
+
+ df2 = t.to_pandas()
+ assert df2.index.equals(pd.Index([10, 20, 30]))
+ assert df2.ndim == df.ndim == 2
+
+ def test_index_level_name_bump_multiindex(self):
+ # GH-46179
+ df = pd.DataFrame(
+ {"col": [1, 2], "__index_level_0__": [3, 4]},
+ index=pd.MultiIndex.from_arrays(
+ [[10, 20], [100, 200]], names=[None, None]
+ ),
+ )
+ _check_pandas_roundtrip(df, preserve_index=True)
+
+ t = pa.Table.from_pandas(df, preserve_index=True)
+ assert t.schema.names == ['col', '__index_level_0__',
+ '__index_level_1__', '__index_level_2__']
+
def test_categorical_column_index(self):
df = pd.DataFrame(
[(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],