This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d05efe6  ARROW-4538: [Python] Remove index column from subschema in write_to_dataframe
d05efe6 is described below

commit d05efe680f4ea60d164ccda641451db06c9d20d7
Author: Christian Thiel <christian.th...@benteler.com>
AuthorDate: Wed Mar 6 12:17:04 2019 +0100

    ARROW-4538: [Python] Remove index column from subschema in write_to_dataframe
    
    I noticed that `parquet.write_to_dataset` fails if the index name is
    not None. This is because `write_to_dataset` only removes columns with
    the old `__index_level_` prefix from the subschema, not index columns
    serialized under their own names.
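
    A minimal sketch of the failure trigger (the path and column names
    here are illustrative, not taken from the original report):

        import pandas as pd
        import pyarrow as pa
        import pyarrow.parquet as pq

        df = pd.DataFrame({'group': list('aabb'), 'value': range(4)})
        # A *named* index is serialized under its own name rather than
        # the '__index_level_N__' prefix, so the old code left it in the
        # per-partition subschema and write_to_dataset raised.
        df.index.name = 'level'

        table = pa.Table.from_pandas(df, preserve_index=True)
        pq.write_to_dataset(table, '/tmp/dataset', partition_cols=['group'])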
    
    In this PR I determine the dataframe index names from the pandas
    metadata, just as `pyarrow.pandas_compat.table_to_blockmanager` does.
    I believe this is a good approach, since the extra round trip through
    pandas in `write_to_dataset` will hopefully become superfluous in the
    future.
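
    A sketch of the metadata lookup this change performs (mirroring the
    parquet.py hunk below; the table itself is illustrative):

        import json

        import pandas as pd
        import pyarrow as pa

        df = pd.DataFrame({'a': [1, 2]},
                          index=pd.Index([10, 20], name='idx'))
        table = pa.Table.from_pandas(df, preserve_index=True)

        metadata = table.schema.metadata
        if metadata is not None and b'pandas' in metadata:
            pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
            # 'index_columns' lists the serialized index column names,
            # e.g. ['idx'] here, which the new code strips from the
            # subschema alongside the '__index_level_' names.
            print(pandas_metadata['index_columns'])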
    
    This is my first contribution to Arrow (apart from some issues), so
    please let me know if this is what you're looking for.
    
    Author: Christian Thiel <christian.th...@benteler.com>
    
    Closes #3744 from c-thiel/master and squashes the following commits:
    
    a66b5f85 <Christian Thiel> ARROW-4538: Remove index columns from subschema in write_to_dataframe
---
 python/pyarrow/parquet.py            | 10 +++++++++-
 python/pyarrow/tests/test_parquet.py | 18 ++++++++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index f5a98eb..fe602bb 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -1249,10 +1249,18 @@ def write_to_dataset(table, root_path, partition_cols=None,
             raise ValueError('No data left to save outside partition columns')
 
         subschema = table.schema
+        # ARROW-4538: Remove index column from subschema in write_to_dataframe
+        metadata = subschema.metadata
+        has_pandas_metadata = (metadata is not None and b'pandas' in metadata)
+        index_columns = []
+        if has_pandas_metadata:
+            pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
+            index_columns = pandas_metadata['index_columns']
         # ARROW-2891: Ensure the output_schema is preserved when writing a
         # partitioned dataset
         for col in table.schema.names:
-            if (col.startswith('__index_level_') or col in partition_cols):
+            if (col.startswith('__index_level_') or col in partition_cols or
+                    col in index_columns):
                 subschema = subschema.remove(subschema.get_field_index(col))
 
         for keys, subgroup in data_df.groupby(partition_keys):
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index f4ced93..77b9ead 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1900,7 +1900,8 @@ def test_read_table_doesnt_warn(datadir):
 
 def _test_write_to_dataset_with_partitions(base_path,
                                            filesystem=None,
-                                           schema=None):
+                                           schema=None,
+                                           index_name=None):
     # ARROW-1400
     output_df = pd.DataFrame({'group1': list('aaabbbbccc'),
                               'group2': list('eefeffgeee'),
@@ -1908,9 +1909,13 @@ def _test_write_to_dataset_with_partitions(base_path,
                               'nan': [pd.np.nan] * 10,
                               'date': np.arange('2017-01-01', '2017-01-11',
                                                 dtype='datetime64[D]')})
+    # ARROW-4538
+    output_df.index.name = index_name
+
     cols = output_df.columns.tolist()
     partition_by = ['group1', 'group2']
-    output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False)
+    output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False,
+                                        preserve_index=True)
     pq.write_to_dataset(output_table, base_path, partition_by,
                         filesystem=filesystem)
 
@@ -1931,6 +1936,10 @@ def _test_write_to_dataset_with_partitions(base_path,
     dataset_cols = set(dataset.schema.to_arrow_schema().names)
     assert dataset_cols == set(output_table.schema.names)
 
+    # ARROW-4538
+    if index_name is not None:
+        assert index_name in dataset_cols
+
     input_table = dataset.read()
     input_df = input_table.to_pandas()
 
@@ -1991,6 +2000,11 @@ def test_write_to_dataset_with_partitions_and_schema(tempdir):
     _test_write_to_dataset_with_partitions(str(tempdir), schema=schema)
 
 
+def test_write_to_dataset_with_partitions_and_index_name(tempdir):
+    _test_write_to_dataset_with_partitions(str(tempdir),
+                                           index_name='index_name')
+
+
 def test_write_to_dataset_no_partitions(tempdir):
     _test_write_to_dataset_no_partitions(str(tempdir))
 
