[arrow] branch master updated: ARROW-1950: [Python] pandas_type in pandas metadata incorrect for List types

cpcloud Wed, 07 Feb 2018 18:38:10 -0800

This is an automated email from the ASF dual-hosted git repository.

cpcloud pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new 7bf7b2e  ARROW-1950: [Python] pandas_type in pandas metadata incorrect for List types
7bf7b2e is described below

commit 7bf7b2e9639a3df10d9de76df6c705e4495e9e75
Author: Phillip Cloud <[email protected]>
AuthorDate: Wed Feb 7 21:37:34 2018 -0500

    ARROW-1950: [Python] pandas_type in pandas metadata incorrect for List types
    
    Author: Phillip Cloud <[email protected]>
    
    Closes #1571 from cpcloud/ARROW-1950 and squashes the following commits:
    
    0b6bc1cb [Phillip Cloud] ARROW-1950: [Python] pandas_type in pandas metadata incorrect for List types
---
 cpp/src/arrow/python/arrow_to_pandas.cc     |  6 ++--
 python/pyarrow/pandas_compat.py             |  2 +-
 python/pyarrow/tests/test_array.py          |  2 +-
 python/pyarrow/tests/test_convert_pandas.py | 51 +++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index fcf05f8..a17d14b 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -56,8 +56,8 @@
 namespace arrow {
 namespace py {
 
-using internal::kPandasTimestampNull;
 using internal::kNanosecondsInDay;
+using internal::kPandasTimestampNull;
 
 using compute::Datum;
 
@@ -90,7 +90,6 @@ struct WrapBytes<FixedSizeBinaryArray> {
 
 static inline bool ListTypeSupported(const DataType& type) {
   switch (type.id()) {
-    case Type::NA:
     case Type::UINT8:
     case Type::INT8:
     case Type::UINT16:
@@ -104,6 +103,7 @@ static inline bool ListTypeSupported(const DataType& type) {
     case Type::BINARY:
     case Type::STRING:
     case Type::TIMESTAMP:
+    case Type::NA:  // empty list
       // The above types are all supported.
       return true;
     case Type::LIST: {
@@ -696,7 +696,6 @@ class ObjectBlock : public PandasBlock {
     } else if (type == Type::LIST) {
       auto list_type = std::static_pointer_cast<ListType>(col->type());
       switch (list_type->value_type()->id()) {
-        CONVERTLISTSLIKE_CASE(FloatType, NA)
         CONVERTLISTSLIKE_CASE(UInt8Type, UINT8)
         CONVERTLISTSLIKE_CASE(Int8Type, INT8)
         CONVERTLISTSLIKE_CASE(UInt16Type, UINT16)
@@ -711,6 +710,7 @@ class ObjectBlock : public PandasBlock {
         CONVERTLISTSLIKE_CASE(BinaryType, BINARY)
         CONVERTLISTSLIKE_CASE(StringType, STRING)
         CONVERTLISTSLIKE_CASE(ListType, LIST)
+        CONVERTLISTSLIKE_CASE(NullType, NA)
         default: {
           std::stringstream ss;
           ss << "Not implemented type for conversion from List to Pandas ObjectBlock: "
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 987bb75..f5e56a9 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -45,7 +45,7 @@ def get_logical_type_map():
 
     if not _logical_type_map:
         _logical_type_map.update({
-            pa.lib.Type_NA: 'float64',  # NaNs
+            pa.lib.Type_NA: 'empty',
             pa.lib.Type_BOOL: 'bool',
             pa.lib.Type_INT8: 'int8',
             pa.lib.Type_INT16: 'int16',
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 1d5d300..efbcef5 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -455,7 +455,7 @@ def test_simple_type_construction():
 @pytest.mark.parametrize(
     ('type', 'expected'),
     [
-        (pa.null(), 'float64'),
+        (pa.null(), 'empty'),
         (pa.bool_(), 'bool'),
         (pa.int8(), 'int8'),
         (pa.int16(), 'int16'),
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 4f0a687..7dbf0d7 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1404,6 +1404,57 @@ class TestPandasConversion(object):
 
         tm.assert_frame_equal(result, df)
 
+    def test_empty_list_metadata(self):
+        # Create table with array of empty lists, forced to have type
+        # list(string) in pyarrow
+        c1 = [["test"], ["a", "b"], None]
+        c2 = [[], [], []]
+        arrays = OrderedDict([
+            ('c1', pa.array(c1, type=pa.list_(pa.string()))),
+            ('c2', pa.array(c2, type=pa.list_(pa.string()))),
+        ])
+        rb = pa.RecordBatch.from_arrays(
+            list(arrays.values()),
+            list(arrays.keys())
+        )
+        tbl = pa.Table.from_batches([rb])
+
+        # First roundtrip changes schema, because pandas cannot preserve the
+        # type of empty lists
+        df = tbl.to_pandas()
+        tbl2 = pa.Table.from_pandas(df, preserve_index=True)
+        md2 = json.loads(tbl2.schema.metadata[b'pandas'].decode('utf8'))
+
+        # Second roundtrip
+        df2 = tbl2.to_pandas()
+        expected = pd.DataFrame(OrderedDict([('c1', c1), ('c2', c2)]))
+
+        tm.assert_frame_equal(df2, expected)
+
+        assert md2['columns'] == [
+            {
+                'name': 'c1',
+                'field_name': 'c1',
+                'metadata': None,
+                'numpy_type': 'object',
+                'pandas_type': 'list[unicode]',
+            },
+            {
+                'name': 'c2',
+                'field_name': 'c2',
+                'metadata': None,
+                'numpy_type': 'object',
+                'pandas_type': 'list[empty]',
+            },
+            {
+                'name': None,
+                'field_name': '__index_level_0__',
+                'metadata': None,
+                'numpy_type': 'int64',
+                'pandas_type': 'int64',
+            }
+        ]
+
 
 def _fully_loaded_dataframe_example():
     from distutils.version import LooseVersion

-- 
To stop receiving notification emails like this one, please contact
[email protected].

[arrow] branch master updated: ARROW-1950: [Python] pandas_type in pandas metadata incorrect for List types

Reply via email to