This is an automated email from the ASF dual-hosted git repository.
cpcloud pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7bf7b2e ARROW-1950: [Python] pandas_type in pandas metadata incorrect for List types
7bf7b2e is described below
commit 7bf7b2e9639a3df10d9de76df6c705e4495e9e75
Author: Phillip Cloud <[email protected]>
AuthorDate: Wed Feb 7 21:37:34 2018 -0500
ARROW-1950: [Python] pandas_type in pandas metadata incorrect for List types
Author: Phillip Cloud <[email protected]>
Closes #1571 from cpcloud/ARROW-1950 and squashes the following commits:
0b6bc1cb [Phillip Cloud] ARROW-1950: [Python] pandas_type in pandas metadata incorrect for List types
---
cpp/src/arrow/python/arrow_to_pandas.cc | 6 ++--
python/pyarrow/pandas_compat.py | 2 +-
python/pyarrow/tests/test_array.py | 2 +-
python/pyarrow/tests/test_convert_pandas.py | 51 +++++++++++++++++++++++++++++
4 files changed, 56 insertions(+), 5 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index fcf05f8..a17d14b 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -56,8 +56,8 @@
namespace arrow {
namespace py {
-using internal::kPandasTimestampNull;
using internal::kNanosecondsInDay;
+using internal::kPandasTimestampNull;
using compute::Datum;
@@ -90,7 +90,6 @@ struct WrapBytes<FixedSizeBinaryArray> {
static inline bool ListTypeSupported(const DataType& type) {
switch (type.id()) {
- case Type::NA:
case Type::UINT8:
case Type::INT8:
case Type::UINT16:
@@ -104,6 +103,7 @@ static inline bool ListTypeSupported(const DataType& type) {
case Type::BINARY:
case Type::STRING:
case Type::TIMESTAMP:
+ case Type::NA: // empty list
// The above types are all supported.
return true;
case Type::LIST: {
@@ -696,7 +696,6 @@ class ObjectBlock : public PandasBlock {
} else if (type == Type::LIST) {
auto list_type = std::static_pointer_cast<ListType>(col->type());
switch (list_type->value_type()->id()) {
- CONVERTLISTSLIKE_CASE(FloatType, NA)
CONVERTLISTSLIKE_CASE(UInt8Type, UINT8)
CONVERTLISTSLIKE_CASE(Int8Type, INT8)
CONVERTLISTSLIKE_CASE(UInt16Type, UINT16)
@@ -711,6 +710,7 @@ class ObjectBlock : public PandasBlock {
CONVERTLISTSLIKE_CASE(BinaryType, BINARY)
CONVERTLISTSLIKE_CASE(StringType, STRING)
CONVERTLISTSLIKE_CASE(ListType, LIST)
+ CONVERTLISTSLIKE_CASE(NullType, NA)
default: {
std::stringstream ss;
ss << "Not implemented type for conversion from List to Pandas ObjectBlock: "
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 987bb75..f5e56a9 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -45,7 +45,7 @@ def get_logical_type_map():
if not _logical_type_map:
_logical_type_map.update({
- pa.lib.Type_NA: 'float64', # NaNs
+ pa.lib.Type_NA: 'empty',
pa.lib.Type_BOOL: 'bool',
pa.lib.Type_INT8: 'int8',
pa.lib.Type_INT16: 'int16',
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 1d5d300..efbcef5 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -455,7 +455,7 @@ def test_simple_type_construction():
@pytest.mark.parametrize(
('type', 'expected'),
[
- (pa.null(), 'float64'),
+ (pa.null(), 'empty'),
(pa.bool_(), 'bool'),
(pa.int8(), 'int8'),
(pa.int16(), 'int16'),
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 4f0a687..7dbf0d7 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1404,6 +1404,57 @@ class TestPandasConversion(object):
tm.assert_frame_equal(result, df)
+ def test_empty_list_metadata(self):
+ # Create table with array of empty lists, forced to have type
+ # list(string) in pyarrow
+ c1 = [["test"], ["a", "b"], None]
+ c2 = [[], [], []]
+ arrays = OrderedDict([
+ ('c1', pa.array(c1, type=pa.list_(pa.string()))),
+ ('c2', pa.array(c2, type=pa.list_(pa.string()))),
+ ])
+ rb = pa.RecordBatch.from_arrays(
+ list(arrays.values()),
+ list(arrays.keys())
+ )
+ tbl = pa.Table.from_batches([rb])
+
+ # First roundtrip changes schema, because pandas cannot preserve the
+ # type of empty lists
+ df = tbl.to_pandas()
+ tbl2 = pa.Table.from_pandas(df, preserve_index=True)
+ md2 = json.loads(tbl2.schema.metadata[b'pandas'].decode('utf8'))
+
+ # Second roundtrip
+ df2 = tbl2.to_pandas()
+ expected = pd.DataFrame(OrderedDict([('c1', c1), ('c2', c2)]))
+
+ tm.assert_frame_equal(df2, expected)
+
+ assert md2['columns'] == [
+ {
+ 'name': 'c1',
+ 'field_name': 'c1',
+ 'metadata': None,
+ 'numpy_type': 'object',
+ 'pandas_type': 'list[unicode]',
+ },
+ {
+ 'name': 'c2',
+ 'field_name': 'c2',
+ 'metadata': None,
+ 'numpy_type': 'object',
+ 'pandas_type': 'list[empty]',
+ },
+ {
+ 'name': None,
+ 'field_name': '__index_level_0__',
+ 'metadata': None,
+ 'numpy_type': 'int64',
+ 'pandas_type': 'int64',
+ }
+ ]
+
def _fully_loaded_dataframe_example():
from distutils.version import LooseVersion
--
To stop receiving notification emails like this one, please contact
[email protected].