This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7ee27ac ARROW-3225: [C++/Python] Pandas object conversion of ListType<DateType> and ListType<TimeType>
7ee27ac is described below
commit 7ee27ac98cfb1aa8031729d823732968b2289f5a
Author: Krisztián Szűcs <[email protected]>
AuthorDate: Sun Oct 7 13:28:18 2018 -0400
ARROW-3225: [C++/Python] Pandas object conversion of ListType<DateType> and ListType<TimeType>
Author: Krisztián Szűcs <[email protected]>
Closes #2712 from kszucs/ARROW-3225 and squashes the following commits:
2579605e5 <Krisztián Szűcs> check-format
92ccd58c9 <Krisztián Szűcs> static casts
4688de277 <Krisztián Szűcs> flake8
9ee8f17a5 <Krisztián Szűcs> test compatibility
f65e3d2f0 <Krisztián Szűcs> remove unused datetime function
308b6b162 <Krisztián Szűcs> create example df with only parquet compatible types
f16d2cd96 <Krisztián Szűcs> pandas roundtrip with list of time and list of date objects
---
cpp/src/arrow/python/arrow_to_pandas.cc | 13 +++++-
cpp/src/arrow/python/python_to_arrow.cc | 76 +++++++++++++++++++++++++++------
cpp/src/arrow/python/util/datetime.h | 18 +++++---
python/pyarrow/tests/pandas_examples.py | 40 ++++++++++++++++-
python/pyarrow/tests/test_parquet.py | 17 ++++++--
5 files changed, 141 insertions(+), 23 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 67510c4..f342a34 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -109,6 +109,10 @@ static inline bool ListTypeSupported(const DataType& type) {
case Type::DOUBLE:
case Type::BINARY:
case Type::STRING:
+ case Type::DATE32:
+ case Type::DATE64:
+ case Type::TIME32:
+ case Type::TIME64:
case Type::TIMESTAMP:
case Type::NA: // empty list
// The above types are all supported.
@@ -791,6 +795,10 @@ class ObjectBlock : public PandasBlock {
CONVERTLISTSLIKE_CASE(Int32Type, INT32)
CONVERTLISTSLIKE_CASE(UInt64Type, UINT64)
CONVERTLISTSLIKE_CASE(Int64Type, INT64)
+ CONVERTLISTSLIKE_CASE(Date32Type, DATE32)
+ CONVERTLISTSLIKE_CASE(Date64Type, DATE64)
+ CONVERTLISTSLIKE_CASE(Time32Type, TIME32)
+ CONVERTLISTSLIKE_CASE(Time64Type, TIME64)
CONVERTLISTSLIKE_CASE(TimestampType, TIMESTAMP)
CONVERTLISTSLIKE_CASE(FloatType, FLOAT)
CONVERTLISTSLIKE_CASE(DoubleType, DOUBLE)
@@ -798,7 +806,6 @@ class ObjectBlock : public PandasBlock {
CONVERTLISTSLIKE_CASE(StringType, STRING)
CONVERTLISTSLIKE_CASE(ListType, LIST)
CONVERTLISTSLIKE_CASE(NullType, NA)
- // TODO(kszucs) Time and Date?
default: {
std::stringstream ss;
ss << "Not implemented type for conversion from List to Pandas ObjectBlock: "
@@ -1869,6 +1876,10 @@ class ArrowDeserializer {
CONVERTVALUES_LISTSLIKE_CASE(Int32Type, INT32)
CONVERTVALUES_LISTSLIKE_CASE(UInt64Type, UINT64)
CONVERTVALUES_LISTSLIKE_CASE(Int64Type, INT64)
+ CONVERTVALUES_LISTSLIKE_CASE(Date32Type, DATE32)
+ CONVERTVALUES_LISTSLIKE_CASE(Date64Type, DATE64)
+ CONVERTVALUES_LISTSLIKE_CASE(Time32Type, TIME32)
+ CONVERTVALUES_LISTSLIKE_CASE(Time64Type, TIME64)
CONVERTVALUES_LISTSLIKE_CASE(TimestampType, TIMESTAMP)
CONVERTVALUES_LISTSLIKE_CASE(FloatType, FLOAT)
CONVERTVALUES_LISTSLIKE_CASE(DoubleType, DOUBLE)
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index e42ac51..a77cebc 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -291,7 +291,7 @@ class Date32Converter : public TypedConverter<Date32Type, Date32Converter> {
int32_t t;
if (PyDate_Check(obj)) {
auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
- t = static_cast<int32_t>(PyDate_to_s(pydate));
+ t = static_cast<int32_t>(PyDate_to_days(pydate));
} else {
RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date32"));
}
@@ -313,16 +313,61 @@ class Date64Converter : public TypedConverter<Date64Type, Date64Converter> {
}
};
-class TimeConverter : public TypedConverter<Time64Type, TimeConverter> {
+class Time32Converter : public TypedConverter<Time32Type, Time32Converter> {
public:
+ explicit Time32Converter(TimeUnit::type unit) : unit_(unit) {}
+
+ Status AppendItem(PyObject* obj) {
+ // TODO(kszucs): option for strict conversion?
+ int32_t t;
+ if (PyTime_Check(obj)) {
+ // datetime.time stores microsecond resolution
+ switch (unit_) {
+ case TimeUnit::SECOND:
+ t = static_cast<int32_t>(PyTime_to_s(obj));
+ break;
+ case TimeUnit::MILLI:
+ t = static_cast<int32_t>(PyTime_to_ms(obj));
+ break;
+ default:
+ return Status::UnknownError("Invalid time unit");
+ }
+ return typed_builder_->Append(t);
+ } else {
+ return internal::InvalidValue(obj, "converting to time32");
+ }
+ }
+
+ private:
+ TimeUnit::type unit_;
+};
+
+class Time64Converter : public TypedConverter<Time64Type, Time64Converter> {
+ public:
+ explicit Time64Converter(TimeUnit::type unit) : unit_(unit) {}
+
Status AppendItem(PyObject* obj) {
+ int64_t t;
if (PyTime_Check(obj)) {
// datetime.time stores microsecond resolution
- return typed_builder_->Append(PyTime_to_us(obj));
+ switch (unit_) {
+ case TimeUnit::MICRO:
+ t = PyTime_to_us(obj);
+ break;
+ case TimeUnit::NANO:
+ t = PyTime_to_ns(obj);
+ break;
+ default:
+ return Status::UnknownError("Invalid time unit");
+ }
+ return typed_builder_->Append(t);
} else {
return internal::InvalidValue(obj, "converting to time64");
}
}
+
+ private:
+ TimeUnit::type unit_;
};
class TimestampConverter : public TypedConverter<TimestampType, TimestampConverter> {
@@ -647,6 +692,7 @@ Status ListConverter::AppendNdarrayItem(PyObject* obj) {
LIST_FAST_CASE(INT64, NPY_INT64, Int64Type)
LIST_SLOW_CASE(DATE32)
LIST_SLOW_CASE(DATE64)
+ LIST_SLOW_CASE(TIME32)
LIST_SLOW_CASE(TIME64)
LIST_FAST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
LIST_FAST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType)
@@ -826,11 +872,14 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
NUMERIC_CONVERTER(UINT16, UInt16Type);
NUMERIC_CONVERTER(UINT32, UInt32Type);
NUMERIC_CONVERTER(UINT64, UInt64Type);
- SIMPLE_CONVERTER_CASE(DATE32, Date32Converter);
- SIMPLE_CONVERTER_CASE(DATE64, Date64Converter);
NUMERIC_CONVERTER(HALF_FLOAT, HalfFloatType);
NUMERIC_CONVERTER(FLOAT, FloatType);
NUMERIC_CONVERTER(DOUBLE, DoubleType);
+ SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter);
+ SIMPLE_CONVERTER_CASE(BINARY, BytesConverter);
+ SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter);
+ SIMPLE_CONVERTER_CASE(DATE32, Date32Converter);
+ SIMPLE_CONVERTER_CASE(DATE64, Date64Converter);
case Type::STRING:
if (strict_conversions) {
*out = std::unique_ptr<SeqConverter>(new StringConverter<true>());
@@ -838,18 +887,21 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
*out = std::unique_ptr<SeqConverter>(new StringConverter<false>());
}
break;
- SIMPLE_CONVERTER_CASE(BINARY, BytesConverter);
- SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter);
+ case Type::TIME32: {
+ *out = std::unique_ptr<SeqConverter>(
+ new Time32Converter(checked_cast<const Time32Type&>(*type).unit()));
+ break;
+ }
+ case Type::TIME64: {
+ *out = std::unique_ptr<SeqConverter>(
+ new Time64Converter(checked_cast<const Time64Type&>(*type).unit()));
+ break;
+ }
case Type::TIMESTAMP: {
*out = std::unique_ptr<SeqConverter>(
new TimestampConverter(checked_cast<const TimestampType&>(*type).unit()));
break;
}
- case Type::TIME32: {
- return Status::NotImplemented("No sequence converter for time32 available");
- }
- SIMPLE_CONVERTER_CASE(TIME64, TimeConverter);
- SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter);
case Type::LIST:
*out = std::unique_ptr<SeqConverter>(
new ListConverter(from_pandas, strict_conversions));
diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h
index d39178d..7350dea 100644
--- a/cpp/src/arrow/python/util/datetime.h
+++ b/cpp/src/arrow/python/util/datetime.h
@@ -163,6 +163,18 @@ static inline int64_t PyTime_to_us(PyObject* pytime) {
PyDateTime_TIME_GET_MICROSECOND(pytime));
}
+static inline int64_t PyTime_to_s(PyObject* pytime) {
+ return PyTime_to_us(pytime) / 1000000;
+}
+
+static inline int64_t PyTime_to_ms(PyObject* pytime) {
+ return PyTime_to_us(pytime) / 1000;
+}
+
+static inline int64_t PyTime_to_ns(PyObject* pytime) {
+ return PyTime_to_us(pytime) * 1000;
+}
+
// Splitting time quantities, for example splitting total seconds into
// minutes and remaining seconds. After we run
// int64_t remaining = split_time(total, quotient, &next)
@@ -256,7 +268,7 @@ static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit,
return Status::OK();
}
-static inline int64_t PyDate_to_s(PyDateTime_Date* pydate) {
+static inline int64_t PyDate_to_days(PyDateTime_Date* pydate) {
return get_days_from_date(PyDateTime_GET_YEAR(pydate),
PyDateTime_GET_MONTH(pydate),
PyDateTime_GET_DAY(pydate));
}
@@ -293,10 +305,6 @@ static inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) {
return PyDateTime_to_us(pydatetime) * 1000;
}
-static inline int32_t PyDate_to_days(PyDateTime_Date* pydate) {
- return static_cast<int32_t>(PyDate_to_ms(pydate) / 86400000LL);
-}
-
} // namespace py
} // namespace arrow
diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py
index 65791d4..78a60f1 100644
--- a/python/pyarrow/tests/pandas_examples.py
+++ b/python/pyarrow/tests/pandas_examples.py
@@ -17,6 +17,7 @@
# under the License.
from collections import OrderedDict
+from datetime import date, time
import numpy as np
import pandas as pd
@@ -80,7 +81,7 @@ def dataframe_with_arrays(include_index=False):
return df, schema
-def dataframe_with_lists(include_index=False):
+def dataframe_with_lists(include_index=False, parquet_compatible=False):
"""
Dataframe with list columns of every possible primtive type.
@@ -89,6 +90,8 @@ def dataframe_with_lists(include_index=False):
df: pandas.DataFrame
schema: pyarrow.Schema
Arrow schema definition that is in line with the constructed df.
+ parquet_compatible: bool
+ Exclude types not supported by parquet
"""
arrays = OrderedDict()
fields = []
@@ -127,8 +130,43 @@ def dataframe_with_lists(include_index=False):
[],
]
+ date_data = [
+ [],
+ [date(2018, 1, 1), date(2032, 12, 30)],
+ [date(2000, 6, 7)],
+ None,
+ [date(1969, 6, 9), date(1972, 7, 3)]
+ ]
+ time_data = [
+ [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
+ [],
+ [time(22, 5, 59)],
+ None,
+ [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
+ ]
+
+ temporal_pairs = [
+ (pa.date32(), date_data),
+ (pa.date64(), date_data),
+ (pa.time32('s'), time_data),
+ (pa.time32('ms'), time_data),
+ (pa.time64('us'), time_data)
+ ]
+ if not parquet_compatible:
+ temporal_pairs += [
+ (pa.time64('ns'), time_data),
+ ]
+
+ for value_type, data in temporal_pairs:
+ field_name = '{}_list'.format(value_type)
+ field_type = pa.list_(value_type)
+ field = pa.field(field_name, field_type)
+ fields.append(field)
+ arrays[field_name] = data
+
if include_index:
fields.append(pa.field('__index_level_0__', pa.int64()))
+
df = pd.DataFrame(arrays)
schema = pa.schema(fields)
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index a3d8725..f3391ce 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -27,7 +27,7 @@ import pandas as pd
import pandas.util.testing as tm
import pyarrow as pa
-from pyarrow.compat import guid, u, BytesIO, unichar
+from pyarrow.compat import guid, u, BytesIO, unichar, PY2
from pyarrow.tests import util
from pyarrow.filesystem import LocalFileSystem
from .pandas_examples import dataframe_with_arrays, dataframe_with_lists
@@ -797,14 +797,23 @@ def test_coerce_timestamps_truncated(tempdir):
def test_column_of_lists(tempdir):
- df, schema = dataframe_with_lists()
+ df, schema = dataframe_with_lists(parquet_compatible=True)
filename = tempdir / 'pandas_rountrip.parquet'
arrow_table = pa.Table.from_pandas(df, schema=schema)
- _write_table(arrow_table, filename, version='2.0',
- coerce_timestamps='ms')
+ _write_table(arrow_table, filename, version='2.0')
table_read = _read_table(filename)
df_read = table_read.to_pandas()
+
+ if PY2:
+ # assert_frame_equal fails when comparing datetime.date and
+ # np.datetime64, even with check_datetimelike_compat=True so
+ # convert the values to np.datetime64 instead
+ for col in ['date32[day]_list', 'date64[ms]_list']:
+ df[col] = df[col].apply(
+ lambda x: list(map(np.datetime64, x)) if x else x
+ )
+
tm.assert_frame_equal(df, df_read)