[ https://issues.apache.org/jira/browse/ARROW-1754?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16227097#comment-16227097 ]

ASF GitHub Bot commented on ARROW-1754:
---------------------------------------

wesm closed pull request #1271: ARROW-1754: [Python] Fix buggy Parquet roundtrip when an index name is the same as a column name
URL: https://github.com/apache/arrow/pull/1271

This is a PR merged from a forked repository. As GitHub hides the
original diff on merge, it is displayed below for the sake of provenance.
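
In short, the patch stops deriving the on-disk name of an index column from
the index's own name. Index columns are now always serialized under the
positional placeholder names '__index_level_0__', '__index_level_1__', ...,
and the logical names are recovered on read from the trailing entries of the
'columns' list in the pandas schema metadata, so an index can no longer
collide with a data column of the same name. A minimal standalone sketch of
the scheme (the metadata dict is abbreviated and illustrative, not the full
metadata format):

    # Index columns are stored under collision-free positional names; the
    # user-facing name lives only in the serialized pandas metadata.
    index_level_name = '__index_level_{:d}__'.format  # as in the patch

    # Abbreviated, illustrative metadata for a frame in which a data column
    # and the index are both named 'time' (real entries carry more fields):
    pandas_metadata = {
        'index_columns': [index_level_name(0)],  # raw storage names
        'columns': [
            {'name': 'close'},  # data columns come first ...
            {'name': 'time'},   # ... one of them shares the index's name
            {'name': 'time'},   # index levels trail the list, logical name
        ],
    }

    # On read, the logical index names come from the trailing entries.
    n = len(pandas_metadata['index_columns'])
    logical_index_names = (
        [c['name'] for c in pandas_metadata['columns'][-n:]] if n else []
    )
    assert logical_index_names == ['time']

The full diff follows.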

diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index f04e9b05a..f0f0f6758 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -72,7 +72,7 @@ MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version) {
     case flatbuf::MetadataVersion_V4:
       // Arrow >= 0.8
       return MetadataVersion::V4;
-      // Add cases as other versions become available
+    // Add cases as other versions become available
     default:
       return MetadataVersion::V4;
   }
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index d6c844c84..1984598ff 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -18,7 +18,6 @@
 import ast
 import collections
 import json
-import re
 
 import numpy as np
 import pandas as pd
@@ -29,13 +28,6 @@
 from pyarrow.compat import PY2, zip_longest  # noqa
 
 
-INDEX_LEVEL_NAME_REGEX = re.compile(r'^__index_level_\d+__$')
-
-
-def is_unnamed_index_level(name):
-    return INDEX_LEVEL_NAME_REGEX.match(name) is not None
-
-
 def infer_dtype(column):
     try:
         return pd.api.types.infer_dtype(column)
@@ -143,7 +135,7 @@ def get_column_metadata(column, name, arrow_type):
 
     Parameters
     ----------
-    column : pandas.Series
+    column : pandas.Series or pandas.Index
     name : str
     arrow_type : pyarrow.DataType
 
@@ -161,7 +153,7 @@ def get_column_metadata(column, name, arrow_type):
         }
         string_dtype = 'object'
 
-    if not isinstance(name, six.string_types):
+    if name is not None and not isinstance(name, six.string_types):
         raise TypeError(
             'Column name must be a string. Got column {} of type {}'.format(
                 name, type(name).__name__
@@ -176,23 +168,7 @@ def get_column_metadata(column, name, arrow_type):
     }
 
 
-def index_level_name(index, i):
-    """Return the name of an index level or a default name if `index.name` is
-    None.
-
-    Parameters
-    ----------
-    index : pandas.Index
-    i : int
-
-    Returns
-    -------
-    name : str
-    """
-    if index.name is not None:
-        return index.name
-    else:
-        return '__index_level_{:d}__'.format(i)
+index_level_name = '__index_level_{:d}__'.format
 
 
 def construct_metadata(df, column_names, index_levels, preserve_index, types):
@@ -222,11 +198,11 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types):
     ]
 
     if preserve_index:
-        index_column_names = [index_level_name(level, i)
-                              for i, level in enumerate(index_levels)]
+        index_column_names = list(map(
+            index_level_name, range(len(index_levels))
+        ))
         index_column_metadata = [
-            get_column_metadata(level, name=index_level_name(level, i),
-                                arrow_type=arrow_type)
+            get_column_metadata(level, name=level.name, arrow_type=arrow_type)
             for i, (level, arrow_type) in enumerate(
                 zip(index_levels, index_types)
             )
@@ -317,7 +293,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
     for i, column in enumerate(index_columns):
         columns_to_convert.append(column)
         convert_types.append(None)
-        names.append(index_level_name(column, i))
+        names.append(index_level_name(i))
 
     # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
     # using a thread pool is worth it. Currently the heuristic is whether the
@@ -378,6 +354,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
     import pyarrow.lib as lib
 
     index_columns = []
+    columns = []
     column_indexes = []
     index_arrays = []
     index_names = []
@@ -390,6 +367,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
     if has_pandas_metadata:
         pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
         index_columns = pandas_metadata['index_columns']
+        columns = pandas_metadata['columns']
         column_indexes = pandas_metadata.get('column_indexes', [])
         table = _add_any_metadata(table, pandas_metadata)
 
@@ -397,11 +375,11 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
 
     # Build up a list of index columns and names while removing those columns
     # from the original table
-    for name in index_columns:
-        i = schema.get_field_index(name)
+    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
+    for raw_name, logical_name in zip(index_columns, logical_index_names):
+        i = schema.get_field_index(raw_name)
         if i != -1:
             col = table.column(i)
-            index_name = None if is_unnamed_index_level(name) else name
             col_pandas = col.to_pandas()
             values = col_pandas.values
             if not values.flags.writeable:
@@ -410,9 +388,9 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
                 values = values.copy()
 
             index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
-            index_names.append(index_name)
+            index_names.append(logical_name)
             block_table = block_table.remove_column(
-                block_table.schema.get_field_index(name)
+                block_table.schema.get_field_index(raw_name)
             )
 
     # Convert an arrow table to Block from the internal pandas API
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index a7fe98ce7..95dd6a471 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1171,7 +1171,8 @@ def test_dataset_read_pandas(tmpdir):
 
 
 @parquet
-def test_dataset_read_pandas_common_metadata(tmpdir):
+@pytest.mark.parametrize('preserve_index', [True, False])
+def test_dataset_read_pandas_common_metadata(tmpdir, preserve_index):
     # ARROW-1103
     import pyarrow.parquet as pq
 
@@ -1186,15 +1187,11 @@ def test_dataset_read_pandas_common_metadata(tmpdir):
     paths = []
     for i in range(nfiles):
         df = _test_dataframe(size, seed=i)
-        df.index = pd.Index(np.arange(i * size, (i + 1) * size))
-        df.index.name = 'index'
+        df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')
 
-        path = pjoin(dirpath, '{0}.parquet'.format(i))
+        path = pjoin(dirpath, '{:d}.parquet'.format(i))
 
-        df_ex_index = df.reset_index(drop=True)
-        df_ex_index['index'] = df.index
-        table = pa.Table.from_pandas(df_ex_index,
-                                     preserve_index=False)
+        table = pa.Table.from_pandas(df, preserve_index=preserve_index)
 
         # Obliterate metadata
         table = table.replace_schema_metadata(None)
@@ -1206,7 +1203,9 @@ def test_dataset_read_pandas_common_metadata(tmpdir):
         paths.append(path)
 
     # Write _metadata common file
-    table_for_metadata = pa.Table.from_pandas(df)
+    table_for_metadata = pa.Table.from_pandas(
+        df, preserve_index=preserve_index
+    )
     pq.write_metadata(table_for_metadata.schema,
                       pjoin(dirpath, '_metadata'))
 
@@ -1214,7 +1213,7 @@ def test_dataset_read_pandas_common_metadata(tmpdir):
     columns = ['uint8', 'strings']
     result = dataset.read_pandas(columns=columns).to_pandas()
     expected = pd.concat([x[columns] for x in frames])
-
+    expected.index.name = df.index.name if preserve_index else None
     tm.assert_frame_equal(result, expected)
 
 
@@ -1387,3 +1386,27 @@ def test_large_table_int32_overflow():
     table = pa.Table.from_arrays([parr], names=['one'])
     f = io.BytesIO()
     _write_table(table, f)
+
+
+def test_index_column_name_duplicate(tmpdir):
+    data = {
+        'close': {
+            pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998,
+            pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998,
+        },
+        'time': {
+            pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp(
+                '2017-06-30 01:31:00'
+            ),
+            pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp(
+                '2017-06-30 01:32:00'
+            ),
+        }
+    }
+    path = str(tmpdir / 'data.parquet')
+    dfx = pd.DataFrame(data).set_index('time', drop=False)
+    tdfx = pa.Table.from_pandas(dfx)
+    _write_table(tdfx, path)
+    arrow_table = _read_table(path)
+    result_df = arrow_table.to_pandas()
+    tm.assert_frame_equal(result_df, dfx)


 


> [Python] Fix buggy Parquet roundtrip when an index name is the same as a 
> column name
> ------------------------------------------------------------------------------------
>
>                 Key: ARROW-1754
>                 URL: https://issues.apache.org/jira/browse/ARROW-1754
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.7.1
>            Reporter: Wes McKinney
>            Assignee: Phillip Cloud
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> See upstream report 
> https://stackoverflow.com/questions/47013052/issue-with-pyarrow-when-loading-parquet-file-where-index-has-redundant-column
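
For reference, a minimal reproduction of the reported failure, mirroring the
test added in the PR (the file path is illustrative):

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    # 'time' is both a regular column and the index, as in the upstream
    # report that prompted this fix.
    df = pd.DataFrame({
        'close': [154.99959, 154.99959],
        'time': pd.to_datetime(['2017-06-30 01:31:00',
                                '2017-06-30 01:32:00']),
    }).set_index('time', drop=False)

    pq.write_table(pa.Table.from_pandas(df), 'data.parquet')
    result = pq.read_table('data.parquet').to_pandas()

    # Before the fix the roundtrip mangled the shared name; with it, the
    # frame comes back exactly.
    pd.testing.assert_frame_equal(result, df)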


