This is an automated email from the ASF dual-hosted git repository. kou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
commit b46236c73fa41b91d6368ce10930be65567fe386 Author: Krisztián Szűcs <[email protected]> AuthorDate: Sun Oct 7 13:28:18 2018 -0400 ARROW-3225: [C++/Python] Pandas object conversion of ListType<DateType> and ListType<TimeType> Author: Krisztián Szűcs <[email protected]> Closes #2712 from kszucs/ARROW-3225 and squashes the following commits: 2579605e5 <Krisztián Szűcs> check-format 92ccd58c9 <Krisztián Szűcs> static casts 4688de277 <Krisztián Szűcs> flake8 9ee8f17a5 <Krisztián Szűcs> test compatibility f65e3d2f0 <Krisztián Szűcs> remove unused datetime function 308b6b162 <Krisztián Szűcs> create example df with only parquet compatible types f16d2cd96 <Krisztián Szűcs> pandas roundtrip with list of time and list of date objects --- cpp/src/arrow/python/arrow_to_pandas.cc | 13 +++++- cpp/src/arrow/python/python_to_arrow.cc | 76 +++++++++++++++++++++++++++------ cpp/src/arrow/python/util/datetime.h | 18 +++++--- python/pyarrow/tests/pandas_examples.py | 40 ++++++++++++++++- python/pyarrow/tests/test_parquet.py | 17 ++++++-- 5 files changed, 141 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 67510c4..f342a34 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -109,6 +109,10 @@ static inline bool ListTypeSupported(const DataType& type) { case Type::DOUBLE: case Type::BINARY: case Type::STRING: + case Type::DATE32: + case Type::DATE64: + case Type::TIME32: + case Type::TIME64: case Type::TIMESTAMP: case Type::NA: // empty list // The above types are all supported. @@ -791,6 +795,10 @@ class ObjectBlock : public PandasBlock { CONVERTLISTSLIKE_CASE(Int32Type, INT32) CONVERTLISTSLIKE_CASE(UInt64Type, UINT64) CONVERTLISTSLIKE_CASE(Int64Type, INT64) + CONVERTLISTSLIKE_CASE(Date32Type, DATE32) + CONVERTLISTSLIKE_CASE(Date64Type, DATE64) + CONVERTLISTSLIKE_CASE(Time32Type, TIME32) + CONVERTLISTSLIKE_CASE(Time64Type, TIME64) CONVERTLISTSLIKE_CASE(TimestampType, TIMESTAMP) CONVERTLISTSLIKE_CASE(FloatType, FLOAT) CONVERTLISTSLIKE_CASE(DoubleType, DOUBLE) @@ -798,7 +806,6 @@ class ObjectBlock : public PandasBlock { CONVERTLISTSLIKE_CASE(StringType, STRING) CONVERTLISTSLIKE_CASE(ListType, LIST) CONVERTLISTSLIKE_CASE(NullType, NA) - // TODO(kszucs) Time and Date? default: { std::stringstream ss; ss << "Not implemented type for conversion from List to Pandas ObjectBlock: " @@ -1869,6 +1876,10 @@ class ArrowDeserializer { CONVERTVALUES_LISTSLIKE_CASE(Int32Type, INT32) CONVERTVALUES_LISTSLIKE_CASE(UInt64Type, UINT64) CONVERTVALUES_LISTSLIKE_CASE(Int64Type, INT64) + CONVERTVALUES_LISTSLIKE_CASE(Date32Type, DATE32) + CONVERTVALUES_LISTSLIKE_CASE(Date64Type, DATE64) + CONVERTVALUES_LISTSLIKE_CASE(Time32Type, TIME32) + CONVERTVALUES_LISTSLIKE_CASE(Time64Type, TIME64) CONVERTVALUES_LISTSLIKE_CASE(TimestampType, TIMESTAMP) CONVERTVALUES_LISTSLIKE_CASE(FloatType, FLOAT) CONVERTVALUES_LISTSLIKE_CASE(DoubleType, DOUBLE) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index e42ac51..a77cebc 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -291,7 +291,7 @@ class Date32Converter : public TypedConverter<Date32Type, Date32Converter> { int32_t t; if (PyDate_Check(obj)) { auto pydate = reinterpret_cast<PyDateTime_Date*>(obj); - t = static_cast<int32_t>(PyDate_to_s(pydate)); + t = static_cast<int32_t>(PyDate_to_days(pydate)); } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date32")); } @@ -313,16 +313,61 @@ class Date64Converter : public TypedConverter<Date64Type, Date64Converter> { } }; -class TimeConverter : public TypedConverter<Time64Type, TimeConverter> { +class Time32Converter : public TypedConverter<Time32Type, Time32Converter> { public: + explicit Time32Converter(TimeUnit::type unit) : unit_(unit) {} + + Status AppendItem(PyObject* obj) { + // TODO(kszucs): option for strict conversion? + int32_t t; + if (PyTime_Check(obj)) { + // datetime.time stores microsecond resolution + switch (unit_) { + case TimeUnit::SECOND: + t = static_cast<int32_t>(PyTime_to_s(obj)); + break; + case TimeUnit::MILLI: + t = static_cast<int32_t>(PyTime_to_ms(obj)); + break; + default: + return Status::UnknownError("Invalid time unit"); + } + return typed_builder_->Append(t); + } else { + return internal::InvalidValue(obj, "converting to time32"); + } + } + + private: + TimeUnit::type unit_; +}; + +class Time64Converter : public TypedConverter<Time64Type, Time64Converter> { + public: + explicit Time64Converter(TimeUnit::type unit) : unit_(unit) {} + Status AppendItem(PyObject* obj) { + int64_t t; if (PyTime_Check(obj)) { // datetime.time stores microsecond resolution - return typed_builder_->Append(PyTime_to_us(obj)); + switch (unit_) { + case TimeUnit::MICRO: + t = PyTime_to_us(obj); + break; + case TimeUnit::NANO: + t = PyTime_to_ns(obj); + break; + default: + return Status::UnknownError("Invalid time unit"); + } + return typed_builder_->Append(t); } else { return internal::InvalidValue(obj, "converting to time64"); } } + + private: + TimeUnit::type unit_; }; class TimestampConverter : public TypedConverter<TimestampType, TimestampConverter> { @@ -647,6 +692,7 @@ Status ListConverter::AppendNdarrayItem(PyObject* obj) { LIST_FAST_CASE(INT64, NPY_INT64, Int64Type) LIST_SLOW_CASE(DATE32) LIST_SLOW_CASE(DATE64) + LIST_SLOW_CASE(TIME32) LIST_SLOW_CASE(TIME64) LIST_FAST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType) LIST_FAST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType) @@ -826,11 +872,14 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas, NUMERIC_CONVERTER(UINT16, UInt16Type); NUMERIC_CONVERTER(UINT32, UInt32Type); NUMERIC_CONVERTER(UINT64, UInt64Type); - SIMPLE_CONVERTER_CASE(DATE32, Date32Converter); - SIMPLE_CONVERTER_CASE(DATE64, Date64Converter); NUMERIC_CONVERTER(HALF_FLOAT, HalfFloatType); NUMERIC_CONVERTER(FLOAT, FloatType); NUMERIC_CONVERTER(DOUBLE, DoubleType); + SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter); + SIMPLE_CONVERTER_CASE(BINARY, BytesConverter); + SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter); + SIMPLE_CONVERTER_CASE(DATE32, Date32Converter); + SIMPLE_CONVERTER_CASE(DATE64, Date64Converter); case Type::STRING: if (strict_conversions) { *out = std::unique_ptr<SeqConverter>(new StringConverter<true>()); @@ -838,18 +887,21 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas, *out = std::unique_ptr<SeqConverter>(new StringConverter<false>()); } break; - SIMPLE_CONVERTER_CASE(BINARY, BytesConverter); - SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter); + case Type::TIME32: { + *out = std::unique_ptr<SeqConverter>( + new Time32Converter(checked_cast<const Time32Type&>(*type).unit())); + break; + } + case Type::TIME64: { + *out = std::unique_ptr<SeqConverter>( + new Time64Converter(checked_cast<const Time64Type&>(*type).unit())); + break; + } case Type::TIMESTAMP: { *out = std::unique_ptr<SeqConverter>( new TimestampConverter(checked_cast<const TimestampType&>(*type).unit())); break; } - case Type::TIME32: { - return Status::NotImplemented("No sequence converter for time32 available"); - } - SIMPLE_CONVERTER_CASE(TIME64, TimeConverter); - SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter); case Type::LIST: *out = std::unique_ptr<SeqConverter>( new ListConverter(from_pandas, strict_conversions)); diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h index d39178d..7350dea 100644 --- a/cpp/src/arrow/python/util/datetime.h +++ b/cpp/src/arrow/python/util/datetime.h @@ -163,6 +163,18 @@ static inline int64_t PyTime_to_us(PyObject* pytime) { PyDateTime_TIME_GET_MICROSECOND(pytime)); } +static inline int64_t PyTime_to_s(PyObject* pytime) { + return PyTime_to_us(pytime) / 1000000; +} + +static inline int64_t PyTime_to_ms(PyObject* pytime) { + return PyTime_to_us(pytime) / 1000; +} + +static inline int64_t PyTime_to_ns(PyObject* pytime) { + return PyTime_to_us(pytime) * 1000; +} + // Splitting time quantities, for example splitting total seconds into // minutes and remaining seconds. After we run // int64_t remaining = split_time(total, quotient, &next) @@ -256,7 +268,7 @@ static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, return Status::OK(); } -static inline int64_t PyDate_to_s(PyDateTime_Date* pydate) { +static inline int64_t PyDate_to_days(PyDateTime_Date* pydate) { return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), PyDateTime_GET_DAY(pydate)); } @@ -293,10 +305,6 @@ static inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) { return PyDateTime_to_us(pydatetime) * 1000; } -static inline int32_t PyDate_to_days(PyDateTime_Date* pydate) { - return static_cast<int32_t>(PyDate_to_ms(pydate) / 86400000LL); -} - } // namespace py } // namespace arrow diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py index 65791d4..78a60f1 100644 --- a/python/pyarrow/tests/pandas_examples.py +++ b/python/pyarrow/tests/pandas_examples.py @@ -17,6 +17,7 @@ # under the License. from collections import OrderedDict +from datetime import date, time import numpy as np import pandas as pd @@ -80,7 +81,7 @@ def dataframe_with_arrays(include_index=False): return df, schema -def dataframe_with_lists(include_index=False): +def dataframe_with_lists(include_index=False, parquet_compatible=False): """ Dataframe with list columns of every possible primtive type. @@ -89,6 +90,8 @@ def dataframe_with_lists(include_index=False): df: pandas.DataFrame schema: pyarrow.Schema Arrow schema definition that is in line with the constructed df. + parquet_compatible: bool + Exclude types not supported by parquet """ arrays = OrderedDict() fields = [] @@ -127,8 +130,43 @@ def dataframe_with_lists(include_index=False): [], ] + date_data = [ + [], + [date(2018, 1, 1), date(2032, 12, 30)], + [date(2000, 6, 7)], + None, + [date(1969, 6, 9), date(1972, 7, 3)] + ] + time_data = [ + [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)], + [], + [time(22, 5, 59)], + None, + [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)] + ] + + temporal_pairs = [ + (pa.date32(), date_data), + (pa.date64(), date_data), + (pa.time32('s'), time_data), + (pa.time32('ms'), time_data), + (pa.time64('us'), time_data) + ] + if not parquet_compatible: + temporal_pairs += [ + (pa.time64('ns'), time_data), + ] + + for value_type, data in temporal_pairs: + field_name = '{}_list'.format(value_type) + field_type = pa.list_(value_type) + field = pa.field(field_name, field_type) + fields.append(field) + arrays[field_name] = data + if include_index: fields.append(pa.field('__index_level_0__', pa.int64())) + df = pd.DataFrame(arrays) schema = pa.schema(fields) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index a3d8725..f3391ce 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -27,7 +27,7 @@ import pandas as pd import pandas.util.testing as tm import pyarrow as pa -from pyarrow.compat import guid, u, BytesIO, unichar +from pyarrow.compat import guid, u, BytesIO, unichar, PY2 from pyarrow.tests import util from pyarrow.filesystem import LocalFileSystem from .pandas_examples import dataframe_with_arrays, dataframe_with_lists @@ -797,14 +797,23 @@ def test_coerce_timestamps_truncated(tempdir): def test_column_of_lists(tempdir): - df, schema = dataframe_with_lists() + df, schema = dataframe_with_lists(parquet_compatible=True) filename = tempdir / 'pandas_rountrip.parquet' arrow_table = pa.Table.from_pandas(df, schema=schema) - _write_table(arrow_table, filename, version='2.0', - coerce_timestamps='ms') + _write_table(arrow_table, filename, version='2.0') table_read = _read_table(filename) df_read = table_read.to_pandas() + + if PY2: + # assert_frame_equal fails when comparing datetime.date and + # np.datetime64, even with check_datetimelike_compat=True so + # convert the values to np.datetime64 instead + for col in ['date32[day]_list', 'date64[ms]_list']: + df[col] = df[col].apply( + lambda x: list(map(np.datetime64, x)) if x else x + ) + tm.assert_frame_equal(df, df_read)
