This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1a8c9a4 ARROW-1976: [Python] Handling unicode pandas columns on
parquet.read_table
1a8c9a4 is described below
commit 1a8c9a414e23cb52178458ec6f62b1a88e47f4b5
Author: Phillip Cloud <[email protected]>
AuthorDate: Mon Feb 5 19:27:42 2018 -0500
ARROW-1976: [Python] Handling unicode pandas columns on parquet.read_table
Author: Phillip Cloud <[email protected]>
Author: Licht-T <[email protected]>
Author: Wes McKinney <[email protected]>
Author: Simba Nyatsanga <[email protected]>
Closes #1553 from cpcloud/ARROW-1976 and squashes the following commits:
d8793f7b [Wes McKinney] Fix flakes
77cd95ba [Phillip Cloud] No need for additional function call
4f71b62c [Phillip Cloud] Make sure it's actually binary
6f8ad84a [Phillip Cloud] Fix binary on python3
e8d41545 [Phillip Cloud] Use _column_name_to_strings
210607f2 [Phillip Cloud] Add tests
40910cb0 [Phillip Cloud] Revert "Fix compat by using text_type"
b098d8a4 [Phillip Cloud] Fix compat by using text_type
a52f5c78 [Phillip Cloud] Revert
8773fadf [Phillip Cloud] Ignore pytest cache
db6176c1 [Simba Nyatsanga] Not using str with frombytes to ensure Python3
tests pass.
e9385c73 [Licht-T] BUG: Convert str by frombytes on pandas_compat.py
17f28b1a [Licht-T] TST: Add tests for Pandas data SerDe with Unicode column
names
85c12315 [Licht-T] BUG: Fix Pandas data SerDe with Unicode column names in
Python 2.7
---
.gitignore | 1 +
python/pyarrow/pandas_compat.py | 17 +++++++++-------
python/pyarrow/parquet.py | 4 ++--
python/pyarrow/tests/test_convert_pandas.py | 30 +++++++++++++++++++++++++++++
4 files changed, 43 insertions(+), 9 deletions(-)
diff --git a/.gitignore b/.gitignore
index e6dfe19..c38694e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,4 @@ cpp/.idea/
python/.eggs/
.vscode
.idea/
+.pytest_cache/
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 240cccd..987bb75 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -170,9 +170,11 @@ def get_column_metadata(column, name, arrow_type,
field_name):
)
)
+ assert field_name is None or isinstance(field_name, six.string_types), \
+ str(type(field_name))
return {
'name': name,
- 'field_name': str(field_name),
+ 'field_name': 'None' if field_name is None else field_name,
'pandas_type': logical_type,
'numpy_type': string_dtype,
'metadata': extra_metadata,
@@ -279,8 +281,11 @@ def _column_name_to_strings(name):
"""
if isinstance(name, six.string_types):
return name
+ elif isinstance(name, six.binary_type):
+ # XXX: should we assume that bytes in Python 3 are UTF-8?
+ return name.decode('utf8')
elif isinstance(name, tuple):
- return tuple(map(_column_name_to_strings, name))
+ return str(tuple(map(_column_name_to_strings, name)))
elif isinstance(name, collections.Sequence):
raise TypeError("Unsupported type for MultiIndex level")
elif name is None:
@@ -327,10 +332,7 @@ def dataframe_to_arrays(df, schema, preserve_index,
nthreads=1):
for name in df.columns:
col = df[name]
- if not isinstance(name, six.string_types):
- name = _column_name_to_strings(name)
- if name is not None:
- name = str(name)
+ name = _column_name_to_strings(name)
if schema is not None:
field = schema.field_by_name(name)
@@ -561,7 +563,8 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
column_strings = [x.name for x in block_table.itercolumns()]
if columns:
columns_name_dict = {
- c.get('field_name', str(c['name'])): c['name'] for c in columns
+ c.get('field_name', _column_name_to_strings(c['name'])): c['name']
+ for c in columns
}
columns_values = [
columns_name_dict.get(name, name) for name in column_strings
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 8820b6b..494e65e 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -939,7 +939,7 @@ def write_table(table, where, row_group_size=None,
version='1.0',
coerce_timestamps=None,
flavor=None, **kwargs):
row_group_size = kwargs.pop('chunk_size', row_group_size)
-
+ use_int96 = use_deprecated_int96_timestamps
try:
with ParquetWriter(
where, table.schema,
@@ -948,7 +948,7 @@ def write_table(table, where, row_group_size=None,
version='1.0',
use_dictionary=use_dictionary,
coerce_timestamps=coerce_timestamps,
compression=compression,
- use_deprecated_int96_timestamps=
use_deprecated_int96_timestamps, # noqa
+ use_deprecated_int96_timestamps=use_int96,
**kwargs) as writer:
writer.write_table(table, row_group_size=row_group_size)
except Exception:
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index 3109907..4f0a687 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -156,6 +156,11 @@ class TestPandasConversion(object):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
_check_pandas_roundtrip(df, preserve_index=True)
+ def test_multiindex_columns_unicode(self):
+ columns = pd.MultiIndex.from_arrays([[u'あ', u'い'], ['X', 'Y']])
+ df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
+ _check_pandas_roundtrip(df, preserve_index=True)
+
def test_integer_index_column(self):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
_check_pandas_roundtrip(df, preserve_index=True)
@@ -519,6 +524,31 @@ class TestPandasConversion(object):
_check_pandas_roundtrip(df, expected_schema=schema)
+ def test_unicode_with_unicode_column_and_index(self):
+ df = pd.DataFrame({u'あ': [u'い']}, index=[u'う'])
+
+ _check_pandas_roundtrip(df, preserve_index=True)
+
+ def test_mixed_unicode_column_names(self):
+ df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う'])
+
+ # TODO(phillipc): Should this raise?
+ with pytest.raises(AssertionError):
+ _check_pandas_roundtrip(df, preserve_index=True)
+
+ def test_binary_column_name(self):
+ column_data = [u'い']
+ data = {u'あ'.encode('utf8'): column_data}
+ df = pd.DataFrame(data)
+
+ # we can't use _check_pandas_roundtrip here because our metadata
+ # is always decoded as utf8: even if binary goes in, utf8 comes out
+ t = pa.Table.from_pandas(df, preserve_index=True)
+ df2 = t.to_pandas()
+ assert df.values[0] == df2.values[0]
+ assert df.index.values[0] == df2.index.values[0]
+ assert df.columns[0] == df2.columns[0].encode('utf8')
+
def test_bytes_to_binary(self):
values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
df = pd.DataFrame({'strings': values})
--
To stop receiving notification emails like this one, please contact
[email protected].