This is an automated email from the ASF dual-hosted git repository. uwe pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new d05efe6 ARROW-4538: [Python] Remove index column from subschema in write_to_dataframe d05efe6 is described below commit d05efe680f4ea60d164ccda641451db06c9d20d7 Author: Christian Thiel <christian.th...@benteler.com> AuthorDate: Wed Mar 6 12:17:04 2019 +0100 ARROW-4538: [Python] Remove index column from subschema in write_to_dataframe I noticed that `parquet.write_to_dataset` fails if the index name is not None. This is due to `write_to_dataframe` only removing the old `__index_level` prefixed column names. In this PR I determined the dataframe index names from the metadata just like in `pyarrow.pandas_compat.table_to_blockmanager`. I believe that this is a good thing to do, as the extra trip via pandas in `write_to_dataframe` will hopefully be superfluous in the future. This is my first contribution to arrow (apart from some issues), so please let me know if this is what you're looking for. Author: Christian Thiel <christian.th...@benteler.com> Closes #3744 from c-thiel/master and squashes the following commits: a66b5f85 <Christian Thiel> ARROW-4538: Remove index columns from subschema in write_to_dataframe --- python/pyarrow/parquet.py | 10 +++++++++- python/pyarrow/tests/test_parquet.py | 18 ++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index f5a98eb..fe602bb 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1249,10 +1249,18 @@ def write_to_dataset(table, root_path, partition_cols=None, raise ValueError('No data left to save outside partition columns') subschema = table.schema + # ARROW-4538: Remove index column from subschema in write_to_dataframe + metadata = subschema.metadata + has_pandas_metadata = (metadata is not None and b'pandas' in metadata) + index_columns = [] + if has_pandas_metadata: + pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8')) + 
index_columns = pandas_metadata['index_columns'] # ARROW-2891: Ensure the output_schema is preserved when writing a # partitioned dataset for col in table.schema.names: - if (col.startswith('__index_level_') or col in partition_cols): + if (col.startswith('__index_level_') or col in partition_cols or + col in index_columns): subschema = subschema.remove(subschema.get_field_index(col)) for keys, subgroup in data_df.groupby(partition_keys): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index f4ced93..77b9ead 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1900,7 +1900,8 @@ def test_read_table_doesnt_warn(datadir): def _test_write_to_dataset_with_partitions(base_path, filesystem=None, - schema=None): + schema=None, + index_name=None): # ARROW-1400 output_df = pd.DataFrame({'group1': list('aaabbbbccc'), 'group2': list('eefeffgeee'), @@ -1908,9 +1909,13 @@ def _test_write_to_dataset_with_partitions(base_path, 'nan': [pd.np.nan] * 10, 'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]')}) + # ARROW-4538 + output_df.index.name = index_name + cols = output_df.columns.tolist() partition_by = ['group1', 'group2'] - output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False) + output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False, + preserve_index=True) pq.write_to_dataset(output_table, base_path, partition_by, filesystem=filesystem) @@ -1931,6 +1936,10 @@ def _test_write_to_dataset_with_partitions(base_path, dataset_cols = set(dataset.schema.to_arrow_schema().names) assert dataset_cols == set(output_table.schema.names) + # ARROW-4538 + if index_name is not None: + assert index_name in dataset_cols + input_table = dataset.read() input_df = input_table.to_pandas() @@ -1991,6 +2000,11 @@ def test_write_to_dataset_with_partitions_and_schema(tempdir): _test_write_to_dataset_with_partitions(str(tempdir), schema=schema) +def 
test_write_to_dataset_with_partitions_and_index_name(tempdir): + _test_write_to_dataset_with_partitions(str(tempdir), + index_name='index_name') + + def test_write_to_dataset_no_partitions(tempdir): _test_write_to_dataset_no_partitions(str(tempdir))