This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 8cdce28163 GH-32439: [Python] Fix off by one bug when chunking nested
structs (#37376)
8cdce28163 is described below
commit 8cdce281634a29b8f79fcc4725f7fad643fa6f7a
Author: Michael Lui <[email protected]>
AuthorDate: Tue Oct 10 12:59:36 2023 -0400
GH-32439: [Python] Fix off by one bug when chunking nested structs (#37376)
### Rationale for this change
See: #32439
### What changes are included in this PR?
During conversion from Python to Arrow, when a struct's child hits a
capacity error and chunking is triggered, the Finish'd chunk can be left in
an invalid state, since the struct's length does not match the length of its
children.
This change simply tries to Append the children first, and only if
successful will it Append the struct. This is safe because the order of Append'ing
between the struct and its child is not specified. It is only specified that
they must be consistent with each other.
This is per:
https://github.com/apache/arrow/blob/86b7a84c9317fa08222eb63f6930bbb54c2e6d0b/cpp/src/arrow/array/builder_nested.h#L507-L508
### Are these changes tested?
A unit test is added that would previously have an invalid data error.
```
> tab = pa.Table.from_pandas(df)
pyarrow/tests/test_pandas.py:4970:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_
pyarrow/table.pxi:3788: in pyarrow.lib.Table.from_pandas
return cls.from_arrays(arrays, schema=schema)
pyarrow/table.pxi:3890: in pyarrow.lib.Table.from_arrays
result.validate()
pyarrow/table.pxi:3170: in pyarrow.lib.Table.validate
check_status(self.table.Validate())
# ...
FAILED pyarrow/tests/test_pandas.py::test_nested_chunking_valid -
pyarrow.lib.ArrowInvalid: Column 0: In chunk 0: Invalid: List child array
invalid: Invalid: Struct child array #0 has length smaller than expected for
struct array (2 < 3)
```
NOTE: This unit test uses about 7GB of memory (max RSS) on my MacBook Pro.
This might make CI challenging; I'm open to suggestions to limit it.
### Are there any user-facing changes?
No
* Closes: #32439
Lead-authored-by: Mike Lui <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/src/arrow/python/python_to_arrow.cc | 16 +++++---
python/pyarrow/tests/test_pandas.py | 44 ++++++++++++++++++++++
2 files changed, 54 insertions(+), 6 deletions(-)
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc
b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 486bd84077..2143129356 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -926,14 +926,14 @@ class PyStructConverter : public
StructConverter<PyConverter, PyConverterTrait>
}
switch (input_kind_) {
case InputKind::DICT:
- RETURN_NOT_OK(this->struct_builder_->Append());
- return AppendDict(value);
+ RETURN_NOT_OK(AppendDict(value));
+ return this->struct_builder_->Append();
case InputKind::TUPLE:
- RETURN_NOT_OK(this->struct_builder_->Append());
- return AppendTuple(value);
+ RETURN_NOT_OK(AppendTuple(value));
+ return this->struct_builder_->Append();
case InputKind::ITEMS:
- RETURN_NOT_OK(this->struct_builder_->Append());
- return AppendItems(value);
+ RETURN_NOT_OK(AppendItems(value));
+ return this->struct_builder_->Append();
default:
RETURN_NOT_OK(InferInputKind(value));
return Append(value);
@@ -944,6 +944,10 @@ class PyStructConverter : public
StructConverter<PyConverter, PyConverterTrait>
Status Init(MemoryPool* pool) override {
RETURN_NOT_OK((StructConverter<PyConverter,
PyConverterTrait>::Init(pool)));
+ // This implementation will check the child values before appending itself,
+ // so no rewind is necessary
+ this->rewind_on_overflow_ = false;
+
// Store the field names as a PyObjects for dict matching
num_fields_ = this->struct_type_->num_fields();
bytes_field_names_.reset(PyList_New(num_fields_));
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index 0d01928f44..62a9443953 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -1691,6 +1691,7 @@ class TestConvertStringLikeTypes:
'strings': [[v1]] * 20 + [[v2]] + [[b'x']]
})
arr = pa.array(df['strings'], from_pandas=True)
+ arr.validate(full=True)
assert isinstance(arr, pa.ChunkedArray)
assert arr.num_chunks == 2
assert len(arr.chunk(0)) == 21
@@ -2381,6 +2382,7 @@ class TestConvertListTypes:
"b": range(n)
})
table = pa.Table.from_pandas(df)
+ table.validate(full=True)
column_a = table[0]
assert column_a.num_chunks == 2
@@ -2623,6 +2625,7 @@ class TestConvertStructTypes:
ty = pa.struct([pa.field('x', pa.float64()),
pa.field('y', pa.binary())])
arr = pa.array(data, type=ty, from_pandas=True)
+ arr.validate(full=True)
assert arr.num_chunks == 2
def iter_chunked_array(arr):
@@ -2655,6 +2658,7 @@ class TestConvertStructTypes:
# Now with explicit mask
mask = np.random.random_sample(n) < 0.2
arr = pa.array(data, type=ty, mask=mask, from_pandas=True)
+ arr.validate(full=True)
assert arr.num_chunks == 2
check(arr, data, mask)
@@ -4826,6 +4830,7 @@ def test_roundtrip_nested_map_array_with_pydicts_sliced():
def assert_roundtrip(series: pd.Series, data) -> None:
array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series,
type=ty))
+ array_roundtrip.validate(full=True)
assert data.equals(array_roundtrip)
assert_roundtrip(series_default, chunked_array)
@@ -4944,3 +4949,42 @@ def test_array_conversion_for_datetime():
result = arr.to_pandas()
tm.assert_series_equal(result, series)
+
+
[email protected]_memory
+def test_nested_chunking_valid():
+ # GH-32439: Chunking can cause arrays to be in invalid state
+ # when nested types are involved.
+ # Here we simply ensure we validate correctly.
+
+ def roundtrip(df, schema=None):
+ tab = pa.Table.from_pandas(df, schema=schema)
+ tab.validate(full=True)
+ # we expect to trigger chunking internally
+ # an assertion failure here may just mean this threshold has changed
+ num_chunks = tab.column(0).num_chunks
+ assert num_chunks > 1
+ tm.assert_frame_equal(tab.to_pandas(self_destruct=True,
+ maps_as_pydicts="strict"), df)
+
+ x = b"0" * 720000000
+ roundtrip(pd.DataFrame({"strings": [x, x, x]}))
+
+ struct = {"struct_field": x}
+ roundtrip(pd.DataFrame({"structs": [struct, struct, struct]}))
+
+ lists = [x]
+ roundtrip(pd.DataFrame({"lists": [lists, lists, lists]}))
+
+ los = [struct]
+ roundtrip(pd.DataFrame({"los": [los, los, los]}))
+
+ sol = {"struct_field": lists}
+ roundtrip(pd.DataFrame({"sol": [sol, sol, sol]}))
+
+ map_of_los = {"a": los}
+ map_type = pa.map_(pa.string(),
+ pa.list_(pa.struct([("struct_field", pa.binary())])))
+ schema = pa.schema([("maps", map_type)])
+ roundtrip(pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]}),
+ schema=schema)