Repository: arrow Updated Branches: refs/heads/master e8945325e -> 7870804e0
ARROW-1074: Support lists and arrays in pandas DataFrames without explicit schema This introduces automatic type inference for lists and numpy arrays in a pandas data frame. Partial implementation for: https://issues.apache.org/jira/browse/ARROW-575 Author: fjetter <florian.jet...@blue-yonder.com> Closes #825 from fjetter/feature/pandas_converter_lists and squashes the following commits: 8bde4e7 [fjetter] Use unicode instead of str in tests 6d262e9 [fjetter] Use OwnedRef reset 037cc77 [fjetter] apply clang-format 331f8a7 [fjetter] Fix bus error 506666f [fjetter] Support numpy array in sequential visitor b54c1f5 [fjetter] Factor out InferArrowType 4a61585 [fjetter] Add vscode config files to gitignore 6dee516 [fjetter] infer lists in pandas converter Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7870804e Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7870804e Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7870804e Branch: refs/heads/master Commit: 7870804e0ea370b0e56811769e5252e6aba69e34 Parents: e894532 Author: fjetter <florian.jet...@blue-yonder.com> Authored: Mon Jul 10 16:22:05 2017 +0200 Committer: Uwe L. Korn <uw...@xhochy.com> Committed: Mon Jul 10 16:22:05 2017 +0200 ---------------------------------------------------------------------- .gitignore | 3 +- cpp/src/arrow/python/builtin_convert.cc | 42 ++++++++++++++++-------- cpp/src/arrow/python/builtin_convert.h | 2 ++ cpp/src/arrow/python/pandas_convert.cc | 13 ++++++-- python/pyarrow/tests/test_convert_pandas.py | 30 +++++++++++++++++ 5 files changed, 72 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/7870804e/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 5e28b36..dd69b6c 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ MANIFEST cpp/.idea/ -python/.eggs/ \ No newline at end of file +python/.eggs/ +.vscode \ No newline at end of file http://git-wip-us.apache.org/repos/asf/arrow/blob/7870804e/cpp/src/arrow/python/builtin_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 11114b0..f10dac7 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -155,14 +155,20 @@ class SeqVisitor { // co-recursive with VisitElem Status Visit(PyObject* obj, int level = 0) { if (level > max_nesting_level_) { max_nesting_level_ = level; } - // Loop through either a sequence or an iterator. if (PySequence_Check(obj)) { Py_ssize_t size = PySequence_Size(obj); for (int64_t i = 0; i < size; ++i) { - // TODO(wesm): Specialize for PyList_GET_ITEM? - OwnedRef ref = OwnedRef(PySequence_GetItem(obj, i)); - RETURN_NOT_OK(VisitElem(ref, level)); + OwnedRef ref; + if (PyArray_Check(obj)) { + auto array = reinterpret_cast<PyArrayObject*>(obj); + auto ptr = reinterpret_cast<const char*>(PyArray_GETPTR1(array, i)); + ref.reset(PyArray_GETITEM(array, ptr)); + RETURN_NOT_OK(VisitElem(ref, level)); + } else { + ref.reset(PySequence_GetItem(obj, i)); + RETURN_NOT_OK(VisitElem(ref, level)); + } } } else if (PyObject_HasAttrString(obj, "__iter__")) { OwnedRef iter = OwnedRef(PyObject_GetIter(obj)); @@ -280,25 +286,32 @@ Status InferArrowSize(PyObject* obj, int64_t* size) { } // Non-exhaustive type inference -Status InferArrowTypeAndSize( - PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) { - RETURN_NOT_OK(InferArrowSize(obj, size)); - - // For 0-length sequences, refuse to guess - if (*size == 0) { *out_type = null(); } - +Status InferArrowType(PyObject* obj, std::shared_ptr<DataType>* out_type) { PyDateTime_IMPORT; SeqVisitor seq_visitor; RETURN_NOT_OK(seq_visitor.Visit(obj)); RETURN_NOT_OK(seq_visitor.Validate()); *out_type = seq_visitor.GetType(); - if (*out_type == nullptr) { return Status::TypeError("Unable to determine data type"); } return Status::OK(); } +Status InferArrowTypeAndSize( + PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) { + RETURN_NOT_OK(InferArrowSize(obj, size)); + + // For 0-length sequences, refuse to guess + if (*size == 0) { + *out_type = null(); + return Status::OK(); + } + RETURN_NOT_OK(InferArrowType(obj, out_type)); + + return Status::OK(); +} + // Marshal Python sequence (list, tuple, etc.) to Arrow array class SeqConverter { public: @@ -464,8 +477,9 @@ class FixedWidthBytesConverter inline Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; OwnedRef tmp; - Py_ssize_t expected_length = std::dynamic_pointer_cast<FixedSizeBinaryType>( - typed_builder_->type())->byte_width(); + Py_ssize_t expected_length = + std::dynamic_pointer_cast<FixedSizeBinaryType>(typed_builder_->type()) + ->byte_width(); if (item.obj() == Py_None) { RETURN_NOT_OK(typed_builder_->AppendNull()); return Status::OK(); http://git-wip-us.apache.org/repos/asf/arrow/blob/7870804e/cpp/src/arrow/python/builtin_convert.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h index 7a84cbe..dd878b2 100644 --- a/cpp/src/arrow/python/builtin_convert.h +++ b/cpp/src/arrow/python/builtin_convert.h @@ -38,6 +38,8 @@ class Status; namespace py { +ARROW_EXPORT arrow::Status InferArrowType( + PyObject* obj, std::shared_ptr<arrow::DataType>* out_type); ARROW_EXPORT arrow::Status InferArrowTypeAndSize( PyObject* obj, int64_t* size, std::shared_ptr<arrow::DataType>* out_type); ARROW_EXPORT arrow::Status InferArrowSize(PyObject* obj, int64_t* size); http://git-wip-us.apache.org/repos/asf/arrow/blob/7870804e/cpp/src/arrow/python/pandas_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index f75a2ba..2364f13 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -893,9 +893,13 @@ Status PandasConverter::ConvertObjects() { return ConvertDates<Date32Type>(); } else if (PyObject_IsInstance(const_cast<PyObject*>(objects[i]), Decimal.obj())) { return ConvertDecimals(); + } else if (PyList_Check(objects[i]) || PyArray_Check(objects[i])) { + std::shared_ptr<DataType> inferred_type; + RETURN_NOT_OK(InferArrowType(objects[i], &inferred_type)); + return ConvertLists(inferred_type); } else { - return InvalidConversion( - const_cast<PyObject*>(objects[i]), "string, bool, float, int, date, decimal"); + return InvalidConversion(const_cast<PyObject*>(objects[i]), + "string, bool, float, int, date, decimal, list, array"); } } } @@ -1038,7 +1042,10 @@ Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type) { LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType) LIST_CASE(STRING, NPY_OBJECT, StringType) default: - return Status::TypeError("Unknown list item type"); + std::stringstream ss; + ss << "Unknown list item type: "; + ss << type->ToString(); + return Status::TypeError(ss.str()); } return Status::TypeError("Unknown list type"); http://git-wip-us.apache.org/repos/asf/arrow/blob/7870804e/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index b952d4a..fb69cac 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -691,3 +691,33 @@ class TestPandasConversion(unittest.TestCase): series = pd.Series(arr.to_pandas()) tm.assert_series_equal(series, expected) + + def test_infer_lists(self): + data = OrderedDict([ + ('nan_ints', [[None, 1], [2, 3]]), + ('ints', [[0, 1], [2, 3]]), + ('strs', [[None, u'b'], [u'c', u'd']]) + ]) + df = pd.DataFrame(data) + + expected_schema = pa.schema([ + pa.field('nan_ints', pa.list_(pa.int64())), + pa.field('ints', pa.list_(pa.int64())), + pa.field('strs', pa.list_(pa.string())) + ]) + + self._check_pandas_roundtrip(df, expected_schema=expected_schema) + + def test_infer_numpy_array(self): + data = OrderedDict([ + ('ints', [ + np.array([0, 1], dtype=np.int64), + np.array([2, 3], dtype=np.int64) + ]) + ]) + df = pd.DataFrame(data) + expected_schema = pa.schema([ + pa.field('ints', pa.list_(pa.int64())) + ]) + + self._check_pandas_roundtrip(df, expected_schema=expected_schema)