[ https://issues.apache.org/jira/browse/ARROW-2040?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16381202#comment-16381202 ]

ASF GitHub Bot commented on ARROW-2040:
---------------------------------------

cpcloud closed pull request #1680: ARROW-2040: [Python] Deserialized Numpy array must keep ref to underlying tensor
URL: https://github.com/apache/arrow/pull/1680
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

Because this pull request comes from a fork, GitHub does not display its
diff once it has been merged, so the diff is reproduced below:

diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc
index 54a71d5a3..5515d24bd 100644
--- a/cpp/src/arrow/python/arrow_to_python.cc
+++ b/cpp/src/arrow/python/arrow_to_python.cc
@@ -94,7 +94,7 @@ Status DeserializeDict(PyObject* context, const Array& array, int64_t start_idx,
 Status DeserializeArray(const Array& array, int64_t offset, PyObject* base,
                         const SerializedPyObject& blobs, PyObject** out) {
   int32_t index = static_cast<const Int32Array&>(array).Value(offset);
-  RETURN_NOT_OK(py::TensorToNdarray(*blobs.tensors[index], base, out));
+  RETURN_NOT_OK(py::TensorToNdarray(blobs.tensors[index], base, out));
   // Mark the array as immutable
   OwnedRef flags(PyObject_GetAttrString(*out, "flags"));
   DCHECK(flags.obj() != NULL) << "Could not mark Numpy array immutable";
diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index 7ba13877d..0cd616aec 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -30,6 +30,7 @@
 #include "arrow/type.h"
 
 #include "arrow/python/common.h"
+#include "arrow/python/pyarrow.h"
 #include "arrow/python/type_traits.h"
 
 namespace arrow {
@@ -251,50 +252,54 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>*
   return Status::OK();
 }
 
-Status TensorToNdarray(const Tensor& tensor, PyObject* base, PyObject** out) {
+Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
+                       PyObject** out) {
   PyAcquireGIL lock;
 
   int type_num;
-  RETURN_NOT_OK(GetNumPyType(*tensor.type(), &type_num));
+  RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num));
   PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num);
   RETURN_IF_PYERROR();
 
-  std::vector<npy_intp> npy_shape(tensor.ndim());
-  std::vector<npy_intp> npy_strides(tensor.ndim());
+  const int ndim = tensor->ndim();
+  std::vector<npy_intp> npy_shape(ndim);
+  std::vector<npy_intp> npy_strides(ndim);
 
-  for (int i = 0; i < tensor.ndim(); ++i) {
-    npy_shape[i] = tensor.shape()[i];
-    npy_strides[i] = tensor.strides()[i];
+  for (int i = 0; i < ndim; ++i) {
+    npy_shape[i] = tensor->shape()[i];
+    npy_strides[i] = tensor->strides()[i];
   }
 
   const void* immutable_data = nullptr;
-  if (tensor.data()) {
-    immutable_data = tensor.data()->data();
+  if (tensor->data()) {
+    immutable_data = tensor->data()->data();
   }
 
   // Remove const =(
   void* mutable_data = const_cast<void*>(immutable_data);
 
   int array_flags = 0;
-  if (tensor.is_row_major()) {
+  if (tensor->is_row_major()) {
     array_flags |= NPY_ARRAY_C_CONTIGUOUS;
   }
-  if (tensor.is_column_major()) {
+  if (tensor->is_column_major()) {
     array_flags |= NPY_ARRAY_F_CONTIGUOUS;
   }
-  if (tensor.is_mutable()) {
+  if (tensor->is_mutable()) {
     array_flags |= NPY_ARRAY_WRITEABLE;
   }
 
   PyObject* result =
-      PyArray_NewFromDescr(&PyArray_Type, dtype, tensor.ndim(), npy_shape.data(),
+      PyArray_NewFromDescr(&PyArray_Type, dtype, ndim, npy_shape.data(),
                            npy_strides.data(), mutable_data, array_flags, nullptr);
   RETURN_IF_PYERROR()
 
-  if (base != Py_None) {
-    PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result), base);
+  if (base == Py_None || base == nullptr) {
+    base = py::wrap_tensor(tensor);
+  } else {
     Py_XINCREF(base);
   }
+  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result), base);
   *out = result;
   return Status::OK();
 }
diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h
index 220e38f2e..dfdb1acd1 100644
--- a/cpp/src/arrow/python/numpy_convert.h
+++ b/cpp/src/arrow/python/numpy_convert.h
@@ -65,7 +65,8 @@ Status GetNumPyType(const DataType& type, int* type_num);
 ARROW_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
                                     std::shared_ptr<Tensor>* out);
 
-ARROW_EXPORT Status TensorToNdarray(const Tensor& tensor, PyObject* base, PyObject** out);
+ARROW_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
+                                    PyObject** out);
 
 }  // namespace py
 }  // namespace arrow
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index a43bfb93b..5b8621f13 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -518,7 +518,7 @@ strides: {2}""".format(self.type, self.shape, self.strides)
             PyObject* out
 
         with nogil:
-            check_status(TensorToNdarray(deref(self.tp), self, &out))
+            check_status(TensorToNdarray(self.sp_tensor, self, &out))
         return PyObject_to_object(out)
 
     def equals(self, Tensor other):
diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd
index f323feaff..4d799ecd2 100644
--- a/python/pyarrow/includes/common.pxd
+++ b/python/pyarrow/includes/common.pxd
@@ -32,6 +32,7 @@ cdef extern from "arrow/python/platform.h":
 
 cdef extern from "<Python.h>":
     void Py_XDECREF(PyObject* o)
+    Py_ssize_t Py_REFCNT(PyObject* o)
 
 cdef extern from "arrow/api.h" namespace "arrow" nogil:
     # We can later add more of the common status factory methods as needed
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8da126aaf..900c3a597 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -871,7 +871,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
     CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
                             shared_ptr[CTensor]* out)
 
-    CStatus TensorToNdarray(const CTensor& tensor, object base,
+    CStatus TensorToNdarray(const shared_ptr[CTensor]& tensor, object base,
                             PyObject** out)
 
     CStatus ConvertArrayToPandas(PandasOptions options,
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index feccebbde..2559c39ef 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -372,6 +372,23 @@ def test_numpy_immutable(large_buffer):
         result[0] = 1.0
 
 
+def test_numpy_base_object(tmpdir):
+    # ARROW-2040: deserialized Numpy array should keep a reference to the
+    # owner of its memory
+    path = os.path.join(str(tmpdir), 'zzz.bin')
+    data = np.arange(12, dtype=np.int32)
+
+    with open(path, 'wb') as f:
+        f.write(pa.serialize(data).to_buffer())
+
+    serialized = pa.read_serialized(pa.OSFile(path))
+    result = serialized.deserialize()
+    assert_equal(result, data)
+    serialized = None
+    assert_equal(result, data)
+    assert result.base is not None
+
+
 # see https://issues.apache.org/jira/browse/ARROW-1695
 def test_serialization_callback_numpy():
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] pyarrow.read_serialized returns bogus data
> ---------------------------------------------------
>
>                 Key: ARROW-2040
>                 URL: https://issues.apache.org/jira/browse/ARROW-2040
>             Project: Apache Arrow
>          Issue Type: Bug
>    Affects Versions: 0.8.0
>            Reporter: Richard Shin
>            Assignee: Antoine Pitrou
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> pyarrow.deserialize works fine, however.
> {code:python}
> Python 2.7.12 (default, Nov 20 2017, 18:23:56)
> [GCC 5.4.0 20160609] on linux2
> Type "help", "copyright", "credits" or "license" for more information.
> >>> import pyarrow as pa, numpy as np
> >>> with open('test.pyarrow', 'w') as f:
> ...     f.write(pa.serialize(np.arange(10, dtype=np.int32)).to_buffer().to_pybytes())
> ...
> >>> pa.read_serialized(pa.OSFile('test.pyarrow')).deserialize()
> array([54846320, 0, 45484448, 0, 4, 5, 6, 7, 8, 9], dtype=int32)
> >>> pa.deserialize(pa.frombuffer(open('test.pyarrow').read()))
> array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to