This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2f99cf8583 GH-44046: [Python] Fix threading issues with borrowed refs
and pandas (#44047)
2f99cf8583 is described below
commit 2f99cf85835c05e0d3d23d06592e6d2f6322aede
Author: Lysandros Nikolaou <[email protected]>
AuthorDate: Thu Sep 12 18:26:46 2024 +0300
GH-44046: [Python] Fix threading issues with borrowed refs and pandas
(#44047)
### Rationale for this change
Fix threading bugs that could leads to races under the free-threaded build.
### What changes are included in this PR?
- Use `PySequence_ITEM` instead of the `Fast` variant on lists under the
free-threaded build.
- Use `std::once_flag` to make sure that `pandas` staic data only gets
initialized once.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* GitHub Issue: #44046
Lead-authored-by: Lysandros Nikolaou <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/src/arrow/python/helpers.cc | 38 ++++++++++++++++++------
python/pyarrow/src/arrow/python/iterators.h | 4 +++
python/pyarrow/src/arrow/python/numpy_convert.cc | 12 ++++++++
3 files changed, 45 insertions(+), 9 deletions(-)
diff --git a/python/pyarrow/src/arrow/python/helpers.cc
b/python/pyarrow/src/arrow/python/helpers.cc
index 18302e6fe0..ca89ebe9d8 100644
--- a/python/pyarrow/src/arrow/python/helpers.cc
+++ b/python/pyarrow/src/arrow/python/helpers.cc
@@ -22,6 +22,7 @@
#include <cmath>
#include <limits>
+#include <mutex>
#include <sstream>
#include <type_traits>
@@ -292,7 +293,15 @@ bool PyFloat_IsNaN(PyObject* obj) {
namespace {
+// This needs a conditional, because using std::once_flag could introduce
+// a deadlock when the GIL is enabled. See
+//
https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272
for
+// more info.
+#ifdef Py_GIL_DISABLED
+static std::once_flag pandas_static_initialized;
+#else
static bool pandas_static_initialized = false;
+#endif
// Once initialized, these variables hold borrowed references to Pandas static
data.
// We should not use OwnedRef here because Python destructors would be
@@ -304,15 +313,7 @@ static PyObject* pandas_Timestamp = nullptr;
static PyTypeObject* pandas_NaTType = nullptr;
static PyObject* pandas_DateOffset = nullptr;
-} // namespace
-
-void InitPandasStaticData() {
- // NOTE: This is called with the GIL held. We needn't (and shouldn't,
- // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
- if (pandas_static_initialized) {
- return;
- }
-
+void GetPandasStaticSymbols() {
OwnedRef pandas;
// Import pandas
@@ -321,11 +322,14 @@ void InitPandasStaticData() {
return;
}
+#ifndef Py_GIL_DISABLED
// Since ImportModule can release the GIL, another thread could have
// already initialized the static data.
if (pandas_static_initialized) {
return;
}
+#endif
+
OwnedRef ref;
// set NaT sentinel and its type
@@ -355,9 +359,25 @@ void InitPandasStaticData() {
if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) {
pandas_DateOffset = ref.obj();
}
+}
+
+} // namespace
+#ifdef Py_GIL_DISABLED
+void InitPandasStaticData() {
+ std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
+}
+#else
+void InitPandasStaticData() {
+ // NOTE: This is called with the GIL held. We needn't (and shouldn't,
+ // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
+ if (pandas_static_initialized) {
+ return;
+ }
+ GetPandasStaticSymbols();
pandas_static_initialized = true;
}
+#endif
bool PandasObjectIsNull(PyObject* obj) {
if (!MayHaveNaN(obj)) {
diff --git a/python/pyarrow/src/arrow/python/iterators.h
b/python/pyarrow/src/arrow/python/iterators.h
index 8512276848..dd467f6ac4 100644
--- a/python/pyarrow/src/arrow/python/iterators.h
+++ b/python/pyarrow/src/arrow/python/iterators.h
@@ -67,7 +67,11 @@ inline Status VisitSequenceGeneric(PyObject* obj, int64_t
offset, VisitorFunc&&
}
if (PySequence_Check(obj)) {
+#ifdef Py_GIL_DISABLED
+ if (PyTuple_Check(obj)) {
+#else
if (PyList_Check(obj) || PyTuple_Check(obj)) {
+#endif
// Use fast item access
const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj);
for (Py_ssize_t i = offset; keep_going && i < size; ++i) {
diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc
b/python/pyarrow/src/arrow/python/numpy_convert.cc
index 5fd2cb511f..4113cc67d2 100644
--- a/python/pyarrow/src/arrow/python/numpy_convert.cc
+++ b/python/pyarrow/src/arrow/python/numpy_convert.cc
@@ -488,7 +488,13 @@ Status NdarraysToSparseCSFTensor(MemoryPool* pool,
PyObject* data_ao, PyObject*
std::vector<std::shared_ptr<Tensor>> indices(ndim);
for (int i = 0; i < ndim - 1; ++i) {
+#ifdef Py_GIL_DISABLED
+ PyObject* item = PySequence_ITEM(indptr_ao, i);
+ RETURN_IF_PYERROR();
+ OwnedRef item_ref(item);
+#else
PyObject* item = PySequence_Fast_GET_ITEM(indptr_ao, i);
+#endif
if (!PyArray_Check(item)) {
return Status::TypeError("Did not pass ndarray object for indptr");
}
@@ -497,7 +503,13 @@ Status NdarraysToSparseCSFTensor(MemoryPool* pool,
PyObject* data_ao, PyObject*
}
for (int i = 0; i < ndim; ++i) {
+#ifdef Py_GIL_DISABLED
+ PyObject* item = PySequence_ITEM(indices_ao, i);
+ RETURN_IF_PYERROR();
+ OwnedRef item_ref(item);
+#else
PyObject* item = PySequence_Fast_GET_ITEM(indices_ao, i);
+#endif
if (!PyArray_Check(item)) {
return Status::TypeError("Did not pass ndarray object for indices");
}