ARROW-3225: [C++/Python] Pandas object conversion of ListType<DateType> and ListType<TimeType>

kou Mon, 08 Oct 2018 04:28:25 -0700

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


commit b46236c73fa41b91d6368ce10930be65567fe386
Author: Krisztián Szűcs <[email protected]>
AuthorDate: Sun Oct 7 13:28:18 2018 -0400

    ARROW-3225: [C++/Python] Pandas object conversion of ListType<DateType> and 
ListType<TimeType>
    
    Author: Krisztián Szűcs <[email protected]>
    
    Closes #2712 from kszucs/ARROW-3225 and squashes the following commits:
    
    2579605e5 <Krisztián Szűcs> check-format
    92ccd58c9 <Krisztián Szűcs> static casts
    4688de277 <Krisztián Szűcs> flake8
    9ee8f17a5 <Krisztián Szűcs> test compatibility
    f65e3d2f0 <Krisztián Szűcs> remove unused datetime function
    308b6b162 <Krisztián Szűcs> create example df with only parquet compatible 
types
    f16d2cd96 <Krisztián Szűcs> pandas roundtrip with list of time and list of 
date objects
---
 cpp/src/arrow/python/arrow_to_pandas.cc | 13 +++++-
 cpp/src/arrow/python/python_to_arrow.cc | 76 +++++++++++++++++++++++++++------
 cpp/src/arrow/python/util/datetime.h    | 18 +++++---
 python/pyarrow/tests/pandas_examples.py | 40 ++++++++++++++++-
 python/pyarrow/tests/test_parquet.py    | 17 ++++++--
 5 files changed, 141 insertions(+), 23 deletions(-)

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc 
b/cpp/src/arrow/python/arrow_to_pandas.cc
index 67510c4..f342a34 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -109,6 +109,10 @@ static inline bool ListTypeSupported(const DataType& type) 
{
     case Type::DOUBLE:
     case Type::BINARY:
     case Type::STRING:
+    case Type::DATE32:
+    case Type::DATE64:
+    case Type::TIME32:
+    case Type::TIME64:
     case Type::TIMESTAMP:
     case Type::NA:  // empty list
       // The above types are all supported.
@@ -791,6 +795,10 @@ class ObjectBlock : public PandasBlock {
         CONVERTLISTSLIKE_CASE(Int32Type, INT32)
         CONVERTLISTSLIKE_CASE(UInt64Type, UINT64)
         CONVERTLISTSLIKE_CASE(Int64Type, INT64)
+        CONVERTLISTSLIKE_CASE(Date32Type, DATE32)
+        CONVERTLISTSLIKE_CASE(Date64Type, DATE64)
+        CONVERTLISTSLIKE_CASE(Time32Type, TIME32)
+        CONVERTLISTSLIKE_CASE(Time64Type, TIME64)
         CONVERTLISTSLIKE_CASE(TimestampType, TIMESTAMP)
         CONVERTLISTSLIKE_CASE(FloatType, FLOAT)
         CONVERTLISTSLIKE_CASE(DoubleType, DOUBLE)
@@ -798,7 +806,6 @@ class ObjectBlock : public PandasBlock {
         CONVERTLISTSLIKE_CASE(StringType, STRING)
         CONVERTLISTSLIKE_CASE(ListType, LIST)
         CONVERTLISTSLIKE_CASE(NullType, NA)
-        // TODO(kszucs) Time and Date?
         default: {
           std::stringstream ss;
           ss << "Not implemented type for conversion from List to Pandas 
ObjectBlock: "
@@ -1869,6 +1876,10 @@ class ArrowDeserializer {
       CONVERTVALUES_LISTSLIKE_CASE(Int32Type, INT32)
       CONVERTVALUES_LISTSLIKE_CASE(UInt64Type, UINT64)
       CONVERTVALUES_LISTSLIKE_CASE(Int64Type, INT64)
+      CONVERTVALUES_LISTSLIKE_CASE(Date32Type, DATE32)
+      CONVERTVALUES_LISTSLIKE_CASE(Date64Type, DATE64)
+      CONVERTVALUES_LISTSLIKE_CASE(Time32Type, TIME32)
+      CONVERTVALUES_LISTSLIKE_CASE(Time64Type, TIME64)
       CONVERTVALUES_LISTSLIKE_CASE(TimestampType, TIMESTAMP)
       CONVERTVALUES_LISTSLIKE_CASE(FloatType, FLOAT)
       CONVERTVALUES_LISTSLIKE_CASE(DoubleType, DOUBLE)
diff --git a/cpp/src/arrow/python/python_to_arrow.cc 
b/cpp/src/arrow/python/python_to_arrow.cc
index e42ac51..a77cebc 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -291,7 +291,7 @@ class Date32Converter : public TypedConverter<Date32Type, 
Date32Converter> {
     int32_t t;
     if (PyDate_Check(obj)) {
       auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
-      t = static_cast<int32_t>(PyDate_to_s(pydate));
+      t = static_cast<int32_t>(PyDate_to_days(pydate));
     } else {
       RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for 
date32"));
     }
@@ -313,16 +313,61 @@ class Date64Converter : public TypedConverter<Date64Type, 
Date64Converter> {
   }
 };
 
-class TimeConverter : public TypedConverter<Time64Type, TimeConverter> {
+class Time32Converter : public TypedConverter<Time32Type, Time32Converter> {
  public:
+  explicit Time32Converter(TimeUnit::type unit) : unit_(unit) {}
+
+  Status AppendItem(PyObject* obj) {
+    // TODO(kszucs): option for strict conversion?
+    int32_t t;
+    if (PyTime_Check(obj)) {
+      // datetime.time stores microsecond resolution
+      switch (unit_) {
+        case TimeUnit::SECOND:
+          t = static_cast<int32_t>(PyTime_to_s(obj));
+          break;
+        case TimeUnit::MILLI:
+          t = static_cast<int32_t>(PyTime_to_ms(obj));
+          break;
+        default:
+          return Status::UnknownError("Invalid time unit");
+      }
+      return typed_builder_->Append(t);
+    } else {
+      return internal::InvalidValue(obj, "converting to time32");
+    }
+  }
+
+ private:
+  TimeUnit::type unit_;
+};
+
+class Time64Converter : public TypedConverter<Time64Type, Time64Converter> {
+ public:
+  explicit Time64Converter(TimeUnit::type unit) : unit_(unit) {}
+
   Status AppendItem(PyObject* obj) {
+    int64_t t;
     if (PyTime_Check(obj)) {
       // datetime.time stores microsecond resolution
-      return typed_builder_->Append(PyTime_to_us(obj));
+      switch (unit_) {
+        case TimeUnit::MICRO:
+          t = PyTime_to_us(obj);
+          break;
+        case TimeUnit::NANO:
+          t = PyTime_to_ns(obj);
+          break;
+        default:
+          return Status::UnknownError("Invalid time unit");
+      }
+      return typed_builder_->Append(t);
     } else {
       return internal::InvalidValue(obj, "converting to time64");
     }
   }
+
+ private:
+  TimeUnit::type unit_;
 };
 
 class TimestampConverter : public TypedConverter<TimestampType, 
TimestampConverter> {
@@ -647,6 +692,7 @@ Status ListConverter::AppendNdarrayItem(PyObject* obj) {
     LIST_FAST_CASE(INT64, NPY_INT64, Int64Type)
     LIST_SLOW_CASE(DATE32)
     LIST_SLOW_CASE(DATE64)
+    LIST_SLOW_CASE(TIME32)
     LIST_SLOW_CASE(TIME64)
     LIST_FAST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
     LIST_FAST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType)
@@ -826,11 +872,14 @@ Status GetConverter(const std::shared_ptr<DataType>& 
type, bool from_pandas,
     NUMERIC_CONVERTER(UINT16, UInt16Type);
     NUMERIC_CONVERTER(UINT32, UInt32Type);
     NUMERIC_CONVERTER(UINT64, UInt64Type);
-    SIMPLE_CONVERTER_CASE(DATE32, Date32Converter);
-    SIMPLE_CONVERTER_CASE(DATE64, Date64Converter);
     NUMERIC_CONVERTER(HALF_FLOAT, HalfFloatType);
     NUMERIC_CONVERTER(FLOAT, FloatType);
     NUMERIC_CONVERTER(DOUBLE, DoubleType);
+    SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter);
+    SIMPLE_CONVERTER_CASE(BINARY, BytesConverter);
+    SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter);
+    SIMPLE_CONVERTER_CASE(DATE32, Date32Converter);
+    SIMPLE_CONVERTER_CASE(DATE64, Date64Converter);
     case Type::STRING:
       if (strict_conversions) {
         *out = std::unique_ptr<SeqConverter>(new StringConverter<true>());
@@ -838,18 +887,21 @@ Status GetConverter(const std::shared_ptr<DataType>& 
type, bool from_pandas,
         *out = std::unique_ptr<SeqConverter>(new StringConverter<false>());
       }
       break;
-      SIMPLE_CONVERTER_CASE(BINARY, BytesConverter);
-      SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter);
+    case Type::TIME32: {
+      *out = std::unique_ptr<SeqConverter>(
+          new Time32Converter(checked_cast<const Time32Type&>(*type).unit()));
+      break;
+    }
+    case Type::TIME64: {
+      *out = std::unique_ptr<SeqConverter>(
+          new Time64Converter(checked_cast<const Time64Type&>(*type).unit()));
+      break;
+    }
     case Type::TIMESTAMP: {
       *out = std::unique_ptr<SeqConverter>(
           new TimestampConverter(checked_cast<const 
TimestampType&>(*type).unit()));
       break;
     }
-    case Type::TIME32: {
-      return Status::NotImplemented("No sequence converter for time32 
available");
-    }
-      SIMPLE_CONVERTER_CASE(TIME64, TimeConverter);
-      SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter);
     case Type::LIST:
       *out = std::unique_ptr<SeqConverter>(
           new ListConverter(from_pandas, strict_conversions));
diff --git a/cpp/src/arrow/python/util/datetime.h 
b/cpp/src/arrow/python/util/datetime.h
index d39178d..7350dea 100644
--- a/cpp/src/arrow/python/util/datetime.h
+++ b/cpp/src/arrow/python/util/datetime.h
@@ -163,6 +163,18 @@ static inline int64_t PyTime_to_us(PyObject* pytime) {
           PyDateTime_TIME_GET_MICROSECOND(pytime));
 }
 
+static inline int64_t PyTime_to_s(PyObject* pytime) {
+  return PyTime_to_us(pytime) / 1000000;
+}
+
+static inline int64_t PyTime_to_ms(PyObject* pytime) {
+  return PyTime_to_us(pytime) / 1000;
+}
+
+static inline int64_t PyTime_to_ns(PyObject* pytime) {
+  return PyTime_to_us(pytime) * 1000;
+}
+
 // Splitting time quantities, for example splitting total seconds into
 // minutes and remaining seconds. After we run
 // int64_t remaining = split_time(total, quotient, &next)
@@ -256,7 +268,7 @@ static inline Status PyDateTime_from_int(int64_t val, const 
TimeUnit::type unit,
   return Status::OK();
 }
 
-static inline int64_t PyDate_to_s(PyDateTime_Date* pydate) {
+static inline int64_t PyDate_to_days(PyDateTime_Date* pydate) {
   return get_days_from_date(PyDateTime_GET_YEAR(pydate), 
PyDateTime_GET_MONTH(pydate),
                             PyDateTime_GET_DAY(pydate));
 }
@@ -293,10 +305,6 @@ static inline int64_t 
PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) {
   return PyDateTime_to_us(pydatetime) * 1000;
 }
 
-static inline int32_t PyDate_to_days(PyDateTime_Date* pydate) {
-  return static_cast<int32_t>(PyDate_to_ms(pydate) / 86400000LL);
-}
-
 }  // namespace py
 }  // namespace arrow
 
diff --git a/python/pyarrow/tests/pandas_examples.py 
b/python/pyarrow/tests/pandas_examples.py
index 65791d4..78a60f1 100644
--- a/python/pyarrow/tests/pandas_examples.py
+++ b/python/pyarrow/tests/pandas_examples.py
@@ -17,6 +17,7 @@
 # under the License.
 
 from collections import OrderedDict
+from datetime import date, time
 
 import numpy as np
 import pandas as pd
@@ -80,7 +81,7 @@ def dataframe_with_arrays(include_index=False):
     return df, schema
 
 
-def dataframe_with_lists(include_index=False):
+def dataframe_with_lists(include_index=False, parquet_compatible=False):
     """
     Dataframe with list columns of every possible primtive type.
 
@@ -89,6 +90,8 @@ def dataframe_with_lists(include_index=False):
     df: pandas.DataFrame
     schema: pyarrow.Schema
         Arrow schema definition that is in line with the constructed df.
+    parquet_compatible: bool
+        Exclude types not supported by parquet
     """
     arrays = OrderedDict()
     fields = []
@@ -127,8 +130,43 @@ def dataframe_with_lists(include_index=False):
         [],
     ]
 
+    date_data = [
+        [],
+        [date(2018, 1, 1), date(2032, 12, 30)],
+        [date(2000, 6, 7)],
+        None,
+        [date(1969, 6, 9), date(1972, 7, 3)]
+    ]
+    time_data = [
+        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
+        [],
+        [time(22, 5, 59)],
+        None,
+        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
+    ]
+
+    temporal_pairs = [
+        (pa.date32(), date_data),
+        (pa.date64(), date_data),
+        (pa.time32('s'), time_data),
+        (pa.time32('ms'), time_data),
+        (pa.time64('us'), time_data)
+    ]
+    if not parquet_compatible:
+        temporal_pairs += [
+            (pa.time64('ns'), time_data),
+        ]
+
+    for value_type, data in temporal_pairs:
+        field_name = '{}_list'.format(value_type)
+        field_type = pa.list_(value_type)
+        field = pa.field(field_name, field_type)
+        fields.append(field)
+        arrays[field_name] = data
+
     if include_index:
         fields.append(pa.field('__index_level_0__', pa.int64()))
+
     df = pd.DataFrame(arrays)
     schema = pa.schema(fields)
 
diff --git a/python/pyarrow/tests/test_parquet.py 
b/python/pyarrow/tests/test_parquet.py
index a3d8725..f3391ce 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -27,7 +27,7 @@ import pandas as pd
 import pandas.util.testing as tm
 
 import pyarrow as pa
-from pyarrow.compat import guid, u, BytesIO, unichar
+from pyarrow.compat import guid, u, BytesIO, unichar, PY2
 from pyarrow.tests import util
 from pyarrow.filesystem import LocalFileSystem
 from .pandas_examples import dataframe_with_arrays, dataframe_with_lists
@@ -797,14 +797,23 @@ def test_coerce_timestamps_truncated(tempdir):
 
 
 def test_column_of_lists(tempdir):
-    df, schema = dataframe_with_lists()
+    df, schema = dataframe_with_lists(parquet_compatible=True)
 
     filename = tempdir / 'pandas_rountrip.parquet'
     arrow_table = pa.Table.from_pandas(df, schema=schema)
-    _write_table(arrow_table, filename, version='2.0',
-                 coerce_timestamps='ms')
+    _write_table(arrow_table, filename, version='2.0')
     table_read = _read_table(filename)
     df_read = table_read.to_pandas()
+
+    if PY2:
+        # assert_frame_equal fails when comparing datetime.date and
+        # np.datetime64, even with check_datetimelike_compat=True so
+        # convert the values to np.datetime64 instead
+        for col in ['date32[day]_list', 'date64[ms]_list']:
+            df[col] = df[col].apply(
+                lambda x: list(map(np.datetime64, x)) if x else x
+            )
+
     tm.assert_frame_equal(df, df_read)

[arrow] 13/15: ARROW-3225: [C++/Python] Pandas object conversion of ListType<DateType> and ListType<TimeType>

Reply via email to