Repository: arrow Updated Branches: refs/heads/master 608b89e16 -> a16c1246e
ARROW-1137: Python: Ensure Pandas roundtrip of all-None column Change-Id: Ib815d3fa42f0a0ae6c0d9850e9b0b435bad1c331 Author: Uwe L. Korn <uw...@xhochy.com> Closes #764 from xhochy/ARROW-1137 and squashes the following commits: 59c0df89 [Uwe L. Korn] Remove unused variables 1d11513f [Uwe L. Korn] ARROW-1137: Python: Ensure Pandas roundtrip of all-None column Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/a16c1246 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/a16c1246 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/a16c1246 Branch: refs/heads/master Commit: a16c1246ec25a020cacc1330650ac7d4cfd6d230 Parents: 608b89e Author: Uwe L. Korn <uw...@xhochy.com> Authored: Thu Jun 22 18:06:12 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Thu Jun 22 18:06:12 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/python/pandas_convert.cc | 63 +++++++++++++++++++++--- python/pyarrow/tests/test_convert_pandas.py | 5 ++ 2 files changed, 61 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/a16c1246/cpp/src/arrow/python/pandas_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index ac61cbc..6b0e342 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -1305,6 +1305,22 @@ inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) return Status::OK(); } +inline Status ConvertNulls(const ChunkedArray& data, PyObject** out_values) { + PyAcquireGIL lock; + for (int c = 0; c < data.num_chunks(); c++) { + std::shared_ptr<Array> arr = data.chunk(c); + + const bool has_nulls = data.null_count() > 0; + for (int64_t i = 0; i < arr->length(); ++i) { + // All values are null + Py_INCREF(Py_None); + *out_values = Py_None; + ++out_values; + } + } + return Status::OK(); +} + inline Status ConvertFixedSizeBinary(const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { @@ -1457,6 +1473,8 @@ class ObjectBlock : public PandasBlock { RETURN_NOT_OK(ConvertFixedSizeBinary(data, out_buffer)); } else if (type == Type::DECIMAL) { RETURN_NOT_OK(ConvertDecimals(data, out_buffer)); + } else if (type == Type::NA) { + RETURN_NOT_OK(ConvertNulls(data, out_buffer)); } else if (type == Type::LIST) { auto list_type = std::static_pointer_cast<ListType>(col->type()); switch (list_type->value_type()->id()) { @@ -1506,7 +1524,12 @@ class IntBlock : public PandasBlock { const ChunkedArray& data = *col->data().get(); - if (type != ARROW_TYPE) { return Status::NotImplemented(col->type()->ToString()); } + if (type != ARROW_TYPE) { + std::stringstream ss; + ss << "Cannot write Arrow data of type " << col->type()->ToString(); + ss << " to a Pandas int" << sizeof(C_TYPE) << " block."; + return Status::NotImplemented(ss.str()); + } ConvertIntegerNoNullsSameType<C_TYPE>(data, out_buffer); placement_data_[rel_placement] = abs_placement; @@ -1532,7 +1555,12 @@ class Float32Block : public PandasBlock { int64_t rel_placement) override { Type::type type = col->type()->id(); - if (type != Type::FLOAT) { return Status::NotImplemented(col->type()->ToString()); } + if (type != Type::FLOAT) { + std::stringstream ss; + ss << "Cannot write Arrow data of type " << col->type()->ToString(); + ss << " to a Pandas float32 block."; + return Status::NotImplemented(ss.str()); + } float* out_buffer = reinterpret_cast<float*>(block_data_) + rel_placement * num_rows_; @@ -1584,7 +1612,10 @@ class Float64Block : public PandasBlock { ConvertNumericNullable<double>(data, NAN, out_buffer); break; default: - return Status::NotImplemented(col->type()->ToString()); + std::stringstream ss; + ss << "Cannot write Arrow data of type " << col->type()->ToString(); + ss << " to a Pandas float64 block."; + return Status::NotImplemented(ss.str()); } #undef INTEGER_CASE @@ -1603,7 +1634,12 @@ class BoolBlock : public PandasBlock { int64_t rel_placement) override { Type::type type = col->type()->id(); - if (type != Type::BOOL) { return Status::NotImplemented(col->type()->ToString()); } + if (type != Type::BOOL) { + std::stringstream ss; + ss << "Cannot write Arrow data of type " << col->type()->ToString(); + ss << " to a Pandas boolean block."; + return Status::NotImplemented(ss.str()); + } uint8_t* out_buffer = reinterpret_cast<uint8_t*>(block_data_) + rel_placement * num_rows_; @@ -1660,7 +1696,10 @@ class DatetimeBlock : public PandasBlock { return Status::NotImplemented("Unsupported time unit"); } } else { - return Status::NotImplemented(col->type()->ToString()); + std::stringstream ss; + ss << "Cannot write Arrow data of type " << col->type()->ToString(); + ss << " to a Pandas datetime block."; + return Status::NotImplemented(ss.str()); } placement_data_[rel_placement] = abs_placement; @@ -1917,8 +1956,14 @@ class DataFrameBlockCreator { case Type::DECIMAL: output_type = PandasBlock::DECIMAL; break; + case Type::NA: + output_type = PandasBlock::OBJECT; + break; default: - return Status::NotImplemented(col->type()->ToString()); + std::stringstream ss; + ss << "No known equivalent Pandas block for Arrow data of type "; + ss << col->type()->ToString() << " is known."; + return Status::NotImplemented(ss.str()); } int block_placement = 0; @@ -2301,7 +2346,11 @@ class ArrowDeserializer { return Status::OK(); } - Status Visit(const NullType& type) { return Status::NotImplemented("null type"); } + Status Visit(const NullType& type) { + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); + return ConvertNulls(data_, out_values); + } Status Visit(const StructType& type) { return Status::NotImplemented("struct type"); } http://git-wip-us.apache.org/repos/asf/arrow/blob/a16c1246/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index ca30455..d17ef3c 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -98,6 +98,11 @@ class TestPandasConversion(unittest.TestCase): tm.assert_series_equal(pd.Series(result), expected, check_names=False) + def test_all_none_objects(self): + df = pd.DataFrame({'a': [None, None, None]}) + self._check_pandas_roundtrip(df) + + def test_float_no_nulls(self): data = {} fields = []