Repository: arrow Updated Branches: refs/heads/master 2d6453b25 -> 02161456c
ARROW-991: [Python] Create new dtype when deserializing from Arrow to NumPy datetime64 I am not sure how to reproduce the problem, but this will prevent us from mutating a cached dtype in the NumPy internals Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #664 from wesm/ARROW-991 and squashes the following commits: 088aa25 [Wes McKinney] Also new PyArray_NewFromDescr for blocks. Fixes pandas test suite ee1904e [Wes McKinney] Remove unit test 2e70b08 [Wes McKinney] Always create new NPY_DATETIME dtype when converting to pandas Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/02161456 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/02161456 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/02161456 Branch: refs/heads/master Commit: 02161456c3de5199ca0304484aff14fb7349bca4 Parents: 2d6453b Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Tue May 9 20:59:42 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Tue May 9 20:59:42 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/python/common.cc | 2 +- cpp/src/arrow/python/common.h | 10 +-- cpp/src/arrow/python/pandas_convert.cc | 103 ++++++++++++++++------------ cpp/src/arrow/status.h | 3 +- cpp/src/arrow/type.cc | 11 ++- cpp/src/arrow/type.h | 6 +- 6 files changed, 70 insertions(+), 65 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/python/common.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/common.cc b/cpp/src/arrow/python/common.cc index 5702c71..ba7b6cf 100644 --- a/cpp/src/arrow/python/common.cc +++ b/cpp/src/arrow/python/common.cc @@ -69,7 +69,7 @@ Status CheckPyError(StatusCode code) { PyObject *exc_type, *exc_value, *traceback; PyErr_Fetch(&exc_type, &exc_value, &traceback); PyErr_NormalizeException(&exc_type, &exc_value, &traceback); - PyObject *exc_value_str = PyObject_Str(exc_value); + PyObject* exc_value_str = PyObject_Str(exc_value); PyObjectStringify stringified(exc_value_str); std::string message(stringified.bytes); Py_XDECREF(exc_type); http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/python/common.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index c5745a5..f6e706b 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -34,9 +34,7 @@ namespace py { class ARROW_EXPORT PyAcquireGIL { public: - PyAcquireGIL() : acquired_gil_(false) { - acquire(); - } + PyAcquireGIL() : acquired_gil_(false) { acquire(); } ~PyAcquireGIL() { release(); } @@ -113,11 +111,9 @@ struct ARROW_EXPORT PyObjectStringify { Status CheckPyError(StatusCode code = StatusCode::UnknownError); // TODO(wesm): We can just let errors pass through. To be explored later -#define RETURN_IF_PYERROR() \ - RETURN_NOT_OK(CheckPyError()); +#define RETURN_IF_PYERROR() RETURN_NOT_OK(CheckPyError()); -#define PY_RETURN_IF_ERROR(CODE) \ - RETURN_NOT_OK(CheckPyError(CODE)); +#define PY_RETURN_IF_ERROR(CODE) RETURN_NOT_OK(CheckPyError(CODE)); // Return the common PyArrow memory pool ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool); http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/python/pandas_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index b54197e..264bed1 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -977,6 +977,55 @@ Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, // ---------------------------------------------------------------------- // pandas 0.x DataFrame conversion internals +inline void set_numpy_metadata(int type, DataType* datatype, PyArray_Descr* out) { + if (type == NPY_DATETIME) { + auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(out->c_metadata); + if (datatype->id() == Type::TIMESTAMP) { + auto timestamp_type = static_cast<TimestampType*>(datatype); + + switch (timestamp_type->unit()) { + case TimestampType::Unit::SECOND: + date_dtype->meta.base = NPY_FR_s; + break; + case TimestampType::Unit::MILLI: + date_dtype->meta.base = NPY_FR_ms; + break; + case TimestampType::Unit::MICRO: + date_dtype->meta.base = NPY_FR_us; + break; + case TimestampType::Unit::NANO: + date_dtype->meta.base = NPY_FR_ns; + break; + } + } else { + // datatype->type == Type::DATE64 + date_dtype->meta.base = NPY_FR_D; + } + } +} + +static inline PyArray_Descr* GetSafeNumPyDtype(int type) { + if (type == NPY_DATETIME) { + // It is not safe to mutate the result of DescrFromType + return PyArray_DescrNewFromType(type); + } else { + return PyArray_DescrFromType(type); + } +} +static inline PyObject* NewArray1DFromType( + DataType* arrow_type, int type, int64_t length, void* data) { + npy_intp dims[1] = {length}; + + PyArray_Descr* descr = GetSafeNumPyDtype(type); + if (descr == nullptr) { + // Error occurred, trust error state is set + return nullptr; + } + + set_numpy_metadata(type, arrow_type, descr); + return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, nullptr, data, 0, nullptr); +} + class PandasBlock { public: enum type { @@ -1024,13 +1073,17 @@ class PandasBlock { Status AllocateNDArray(int npy_type, int ndim = 2) { PyAcquireGIL lock; + PyArray_Descr* descr = GetSafeNumPyDtype(npy_type); + PyObject* block_arr; if (ndim == 2) { npy_intp block_dims[2] = {num_columns_, num_rows_}; - block_arr = PyArray_SimpleNew(2, block_dims, npy_type); + block_arr = PyArray_NewFromDescr( + &PyArray_Type, descr, 2, block_dims, nullptr, nullptr, 0, nullptr); } else { npy_intp block_dims[1] = {num_rows_}; - block_arr = PyArray_SimpleNew(1, block_dims, npy_type); + block_arr = PyArray_NewFromDescr( + &PyArray_Type, descr, 1, block_dims, nullptr, nullptr, 0, nullptr); } if (block_arr == NULL) { @@ -1947,34 +2000,6 @@ class DataFrameBlockCreator { BlockMap datetimetz_blocks_; }; -inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) { - if (type == NPY_DATETIME) { - PyArray_Descr* descr = PyArray_DESCR(out); - auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata); - if (datatype->id() == Type::TIMESTAMP) { - auto timestamp_type = static_cast<TimestampType*>(datatype); - - switch (timestamp_type->unit()) { - case TimestampType::Unit::SECOND: - date_dtype->meta.base = NPY_FR_s; - break; - case TimestampType::Unit::MILLI: - date_dtype->meta.base = NPY_FR_ms; - break; - case TimestampType::Unit::MICRO: - date_dtype->meta.base = NPY_FR_us; - break; - case TimestampType::Unit::NANO: - date_dtype->meta.base = NPY_FR_ns; - break; - } - } else { - // datatype->type == Type::DATE64 - date_dtype->meta.base = NPY_FR_D; - } - } -} - class ArrowDeserializer { public: ArrowDeserializer(const std::shared_ptr<Column>& col, PyObject* py_ref) @@ -1983,17 +2008,8 @@ class ArrowDeserializer { Status AllocateOutput(int type) { PyAcquireGIL lock; - npy_intp dims[1] = {col_->length()}; - result_ = PyArray_SimpleNew(1, dims, type); + result_ = NewArray1DFromType(col_->type().get(), type, col_->length(), nullptr); arr_ = reinterpret_cast<PyArrayObject*>(result_); - - if (arr_ == NULL) { - // Error occurred, trust that SimpleNew set the error state - return Status::OK(); - } - - set_numpy_metadata(type, col_->type().get(), arr_); - return Status::OK(); } @@ -2010,17 +2026,14 @@ class ArrowDeserializer { PyAcquireGIL lock; // Zero-Copy. We can pass the data pointer directly to NumPy. - npy_intp dims[1] = {col_->length()}; - result_ = PyArray_SimpleNewFromData(1, dims, npy_type, data); + result_ = NewArray1DFromType(col_->type().get(), npy_type, col_->length(), data); arr_ = reinterpret_cast<PyArrayObject*>(result_); if (arr_ == NULL) { - // Error occurred, trust that SimpleNew set the error state + // Error occurred, trust that error set return Status::OK(); } - set_numpy_metadata(npy_type, col_->type().get(), arr_); - if (PyArray_SetBaseObject(arr_, py_ref_) == -1) { // Error occurred, trust that SetBaseObject set the error state return Status::OK(); http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/status.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 6a8cee2..1688b96 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -91,8 +91,7 @@ class ARROW_EXPORT Status { Status() : state_(NULL) {} ~Status() { delete[] state_; } - Status(StatusCode code, const std::string& msg) - : Status(code, msg, -1) {} + Status(StatusCode code, const std::string& msg) : Status(code, msg, -1) {} // Copy the specified status. Status(const Status& s); http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/type.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index b1e322c..afb3027 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -42,9 +42,7 @@ std::shared_ptr<Field> Field::RemoveMetadata() const { } bool Field::Equals(const Field& other) const { - if (this == &other) { - return true; - } + if (this == &other) { return true; } if (this->name_ == other.name_ && this->nullable_ == other.nullable_ && this->type_->Equals(*other.type_.get())) { if (metadata_ == nullptr && other.metadata_ == nullptr) { @@ -322,8 +320,7 @@ std::string Schema::ToString() const { if (metadata_) { buffer << "\n-- metadata --"; for (int64_t i = 0; i < metadata_->size(); ++i) { - buffer << "\n" << metadata_->key(i) << ": " - << metadata_->value(i); + buffer << "\n" << metadata_->key(i) << ": " << metadata_->value(i); } } @@ -419,8 +416,8 @@ std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type return std::make_shared<DictionaryType>(index_type, dict_values); } -std::shared_ptr<Field> field( - const std::string& name, const std::shared_ptr<DataType>& type, bool nullable, +std::shared_ptr<Field> field(const std::string& name, + const std::shared_ptr<DataType>& type, bool nullable, const std::shared_ptr<const KeyValueMetadata>& metadata) { return std::make_shared<Field>(name, type, nullable, metadata); } http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index bb25857..40615f7 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -205,7 +205,7 @@ class ARROW_EXPORT Field { Field(const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true, const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr) - : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {} + : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {} std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; } @@ -753,8 +753,8 @@ std::shared_ptr<DataType> ARROW_EXPORT union_( std::shared_ptr<DataType> ARROW_EXPORT dictionary( const std::shared_ptr<DataType>& index_type, const std::shared_ptr<Array>& values); -std::shared_ptr<Field> ARROW_EXPORT field( - const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true, +std::shared_ptr<Field> ARROW_EXPORT field(const std::string& name, + const std::shared_ptr<DataType>& type, bool nullable = true, const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr); // ----------------------------------------------------------------------