[jira] [Commented] (ARROW-1950) [Python] pandas_type in pandas metadata incorrect for List types

ASF GitHub Bot (JIRA) Wed, 07 Feb 2018 18:38:45 -0800

    [ 
https://issues.apache.org/jira/browse/ARROW-1950?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16356401#comment-16356401
 ]


ASF GitHub Bot commented on ARROW-1950:
---------------------------------------

cpcloud closed pull request #1571: ARROW-1950: [Python] pandas_type in pandas 
metadata incorrect for List types
URL: https://github.com/apache/arrow/pull/1571
 
 
   

This is a foreign pull request, merged from a forked repository.
Because GitHub hides the original diff of such a pull request once it is
merged, the diff is reproduced below for the sake of provenance:

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index fcf05f833..a17d14bf6 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -56,8 +56,8 @@
 namespace arrow {
 namespace py {
 
-using internal::kPandasTimestampNull;
 using internal::kNanosecondsInDay;
+using internal::kPandasTimestampNull;
 
 using compute::Datum;
 
@@ -90,7 +90,6 @@ struct WrapBytes<FixedSizeBinaryArray> {
 
 static inline bool ListTypeSupported(const DataType& type) {
   switch (type.id()) {
-    case Type::NA:
     case Type::UINT8:
     case Type::INT8:
     case Type::UINT16:
@@ -104,6 +103,7 @@ static inline bool ListTypeSupported(const DataType& type) {
     case Type::BINARY:
     case Type::STRING:
     case Type::TIMESTAMP:
+    case Type::NA:  // empty list
       // The above types are all supported.
       return true;
     case Type::LIST: {
@@ -696,7 +696,6 @@ class ObjectBlock : public PandasBlock {
     } else if (type == Type::LIST) {
       auto list_type = std::static_pointer_cast<ListType>(col->type());
       switch (list_type->value_type()->id()) {
-        CONVERTLISTSLIKE_CASE(FloatType, NA)
         CONVERTLISTSLIKE_CASE(UInt8Type, UINT8)
         CONVERTLISTSLIKE_CASE(Int8Type, INT8)
         CONVERTLISTSLIKE_CASE(UInt16Type, UINT16)
@@ -711,6 +710,7 @@ class ObjectBlock : public PandasBlock {
         CONVERTLISTSLIKE_CASE(BinaryType, BINARY)
         CONVERTLISTSLIKE_CASE(StringType, STRING)
         CONVERTLISTSLIKE_CASE(ListType, LIST)
+        CONVERTLISTSLIKE_CASE(NullType, NA)
         default: {
           std::stringstream ss;
           ss << "Not implemented type for conversion from List to Pandas ObjectBlock: "
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 987bb7555..f5e56a9b2 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -45,7 +45,7 @@ def get_logical_type_map():
 
     if not _logical_type_map:
         _logical_type_map.update({
-            pa.lib.Type_NA: 'float64',  # NaNs
+            pa.lib.Type_NA: 'empty',
             pa.lib.Type_BOOL: 'bool',
             pa.lib.Type_INT8: 'int8',
             pa.lib.Type_INT16: 'int16',
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 1d5d30071..efbcef5e1 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -455,7 +455,7 @@ def test_simple_type_construction():
 @pytest.mark.parametrize(
     ('type', 'expected'),
     [
-        (pa.null(), 'float64'),
+        (pa.null(), 'empty'),
         (pa.bool_(), 'bool'),
         (pa.int8(), 'int8'),
         (pa.int16(), 'int16'),
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 4f0a68729..7dbf0d7ed 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1404,6 +1404,57 @@ def test_empty_list_roundtrip(self):
 
         tm.assert_frame_equal(result, df)
 
+    def test_empty_list_metadata(self):
+        # Create table with array of empty lists, forced to have type
+        # list(string) in pyarrow
+        c1 = [["test"], ["a", "b"], None]
+        c2 = [[], [], []]
+        arrays = OrderedDict([
+            ('c1', pa.array(c1, type=pa.list_(pa.string()))),
+            ('c2', pa.array(c2, type=pa.list_(pa.string()))),
+        ])
+        rb = pa.RecordBatch.from_arrays(
+            list(arrays.values()),
+            list(arrays.keys())
+        )
+        tbl = pa.Table.from_batches([rb])
+
+        # First roundtrip changes schema, because pandas cannot preserve the
+        # type of empty lists
+        df = tbl.to_pandas()
+        tbl2 = pa.Table.from_pandas(df, preserve_index=True)
+        md2 = json.loads(tbl2.schema.metadata[b'pandas'].decode('utf8'))
+
+        # Second roundtrip
+        df2 = tbl2.to_pandas()
+        expected = pd.DataFrame(OrderedDict([('c1', c1), ('c2', c2)]))
+
+        tm.assert_frame_equal(df2, expected)
+
+        assert md2['columns'] == [
+            {
+                'name': 'c1',
+                'field_name': 'c1',
+                'metadata': None,
+                'numpy_type': 'object',
+                'pandas_type': 'list[unicode]',
+            },
+            {
+                'name': 'c2',
+                'field_name': 'c2',
+                'metadata': None,
+                'numpy_type': 'object',
+                'pandas_type': 'list[empty]',
+            },
+            {
+                'name': None,
+                'field_name': '__index_level_0__',
+                'metadata': None,
+                'numpy_type': 'int64',
+                'pandas_type': 'int64',
+            }
+        ]
+
 
 def _fully_loaded_dataframe_example():
     from distutils.version import LooseVersion


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] pandas_type in pandas metadata incorrect for List types
> ----------------------------------------------------------------
>
>                 Key: ARROW-1950
>                 URL: https://issues.apache.org/jira/browse/ARROW-1950
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>            Reporter: Wes McKinney
>            Assignee: Phillip Cloud
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> see https://github.com/pandas-dev/pandas/pull/18201#issuecomment-353042438



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (ARROW-1950) [Python] pandas_type in pandas metadata incorrect for List types

Reply via email to