This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1a8c9a4 ARROW-1976: [Python] Handling unicode pandas columns on
parquet.read_table
1a8c9a4 is described below
commit 1a8c9a414e23cb52178458ec6f62b1a88e47f4b5
Author: Phillip Cloud <[email protected]>
AuthorDate: Mon Feb 5 19:27:42 2018 -0500
ARROW-1976: [Python] Handling unicode pandas columns on parquet.read_table
Author: Phillip Cloud <[email protected]>
Author: Licht-T <[email protected]>
Author: Wes McKinney <[email protected]>
Author: Simba Nyatsanga <[email protected]>
Closes #1553 from cpcloud/ARROW-1976 and squashes the following commits:
d8793f7b [Wes McKinney] Fix flakes
77cd95ba [Phillip Cloud] No need for additional function call
4f71b62c [Phillip Cloud] Make sure it's actually binary
6f8ad84a [Phillip Cloud] Fix binary on python3
e8d41545 [Phillip Cloud] Use _column_name_to_strings
210607f2 [Phillip Cloud] Add tests
40910cb0 [Phillip Cloud] Revert "Fix compat by using text_type"
b098d8a4 [Phillip Cloud] Fix compat by using text_type
a52f5c78 [Phillip Cloud] Revert
8773fadf [Phillip Cloud] Ignore pytest cache
db6176c1 [Simba Nyatsanga] Not using str with frombytes to ensure Python3
tests pass.
e9385c73 [Licht-T] BUG: Convert str by frombytes on pandas_compat.py
17f28b1a [Licht-T] TST: Add tests for Pandas data SerDe with Unicode column
names
85c12315 [Licht-T] BUG: Fix Pandas data SerDe with Unicode column names in
Python 2.7
---
.gitignore | 1 +
python/pyarrow/pandas_compat.py | 17 +++++++++-------
python/pyarrow/parquet.py | 4 ++--
python/pyarrow/tests/test_convert_pandas.py | 30 +++++++++++++++++++++++++++++
4 files changed, 43 insertions(+), 9 deletions(-)
diff --git a/.gitignore b/.gitignore
index e6dfe19..c38694e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,4 @@ cpp/.idea/
python/.eggs/
.vscode
.idea/
+.pytest_cache/
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 240cccd..987bb75 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -170,9 +170,11 @@ def get_column_metadata(column, name, arrow_type,
field_name):
)
)
+ assert field_name is None or isinstance(field_name, six.string_types), \
+ str(type(field_name))
return {
'name': name,
- 'field_name': str(field_name),
+ 'field_name': 'None' if field_name is None else field_name,
'pandas_type': logical_type,
'numpy_type': string_dtype,
'metadata': extra_metadata,
@@ -279,8 +281,11 @@ def _column_name_to_strings(name):
"""
if isinstance(name, six.string_types):
return name
+ elif isinstance(name, six.binary_type):
+ # XXX: should we assume that bytes in Python 3 are UTF-8?
+ return name.decode('utf8')
elif isinstance(name, tuple):
- return tuple(map(_column_name_to_strings, name))
+ return str(tuple(map(_column_name_to_strings, name)))
elif isinstance(name, collections.Sequence):
raise TypeError("Unsupported type for MultiIndex level")
elif name is None:
@@ -327,10 +332,7 @@ def dataframe_to_arrays(df, schema, preserve_index,
nthreads=1):
for name in df.columns:
col = df[name]
- if not isinstance(name, six.string_types):
- name = _column_name_to_strings(name)
- if name is not None:
- name = str(name)
+ name = _column_name_to_strings(name)
if schema is not None:
field = schema.field_by_name(name)
@@ -561,7 +563,8 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
column_strings = [x.name for x in block_table.itercolumns()]
if columns:
columns_name_dict = {
- c.get('field_name', str(c['name'])): c['name'] for c in columns
+ c.get('field_name', _column_name_to_strings(c['name'])): c['name']
+ for c in columns
}
columns_values = [
columns_name_dict.get(name, name) for name in column_strings
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 8820b6b..494e65e 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -939,7 +939,7 @@ def write_table(table, where, row_group_size=None,
version='1.0',
coerce_timestamps=None,
flavor=None, **kwargs):
row_group_size = kwargs.pop('chunk_size', row_group_size)
-
+ use_int96 = use_deprecated_int96_timestamps
try:
with ParquetWriter(
where, table.schema,
@@ -948,7 +948,7 @@ def write_table(table, where, row_group_size=None,
version='1.0',
use_dictionary=use_dictionary,
coerce_timestamps=coerce_timestamps,
compression=compression,
- use_deprecated_int96_timestamps=
use_deprecated_int96_timestamps, # noqa
+ use_deprecated_int96_timestamps=use_int96,
**kwargs) as writer:
writer.write_table(table, row_group_size=row_group_size)
except Exception:
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index 3109907..4f0a687 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -156,6 +156,11 @@ class TestPandasConversion(object):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
_check_pandas_roundtrip(df, preserve_index=True)
+ def test_multiindex_columns_unicode(self):
+ columns = pd.MultiIndex.from_arrays([[u'あ', u'い'], ['X', 'Y']])
+ df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
+ _check_pandas_roundtrip(df, preserve_index=True)
+
def test_integer_index_column(self):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
_check_pandas_roundtrip(df, preserve_index=True)
@@ -519,6 +524,31 @@ class TestPandasConversion(object):
_check_pandas_roundtrip(df, expected_schema=schema)
+ def test_unicode_with_unicode_column_and_index(self):
+ df = pd.DataFrame({u'あ': [u'い']}, index=[u'う'])
+
+ _check_pandas_roundtrip(df, preserve_index=True)
+
+ def test_mixed_unicode_column_names(self):
+ df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う'])
+
+ # TODO(phillipc): Should this raise?
+ with pytest.raises(AssertionError):
+ _check_pandas_roundtrip(df, preserve_index=True)
+
+ def test_binary_column_name(self):
+ column_data = [u'い']
+ data = {u'あ'.encode('utf8'): column_data}
+ df = pd.DataFrame(data)
+
+ # we can't use _check_pandas_roundtrip here because our metadata
+ # is always decoded as utf8: even if binary goes in, utf8 comes out
+ t = pa.Table.from_pandas(df, preserve_index=True)
+ df2 = t.to_pandas()
+ assert df.values[0] == df2.values[0]
+ assert df.index.values[0] == df2.index.values[0]
+ assert df.columns[0] == df2.columns[0].encode('utf8')
+
def test_bytes_to_binary(self):
values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
df = pd.DataFrame({'strings': values})
--
To stop receiving notification emails like this one, please contact
[email protected].