This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 8cdce28163 GH-32439: [Python] Fix off by one bug when chunking nested
structs (#37376)
8cdce28163 is described below
commit 8cdce281634a29b8f79fcc4725f7fad643fa6f7a
Author: Michael Lui <[email protected]>
AuthorDate: Tue Oct 10 12:59:36 2023 -0400
GH-32439: [Python] Fix off by one bug when chunking nested structs (#37376)
### Rationale for this change
See: #32439
### What changes are included in this PR?
During conversion from Python to Arrow, when a struct's child hits a
capacity error and chunking is triggered, the Finish'd chunk can be left in
an invalid state, since the struct's length does not match the length of its
children.
This change simply tries to Append the children first, and only if
successful will it Append the struct. This is safe because the order of Append'ing
between the struct and its child is not specified. It is only specified that
they must be consistent with each other.
This is per:
https://github.com/apache/arrow/blob/86b7a84c9317fa08222eb63f6930bbb54c2e6d0b/cpp/src/arrow/array/builder_nested.h#L507-L508
### Are these changes tested?
A unit test is added that would previously have an invalid data error.
```
> tab = pa.Table.from_pandas(df)
pyarrow/tests/test_pandas.py:4970:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_
pyarrow/table.pxi:3788: in pyarrow.lib.Table.from_pandas
return cls.from_arrays(arrays, schema=schema)
pyarrow/table.pxi:3890: in pyarrow.lib.Table.from_arrays
result.validate()
pyarrow/table.pxi:3170: in pyarrow.lib.Table.validate
check_status(self.table.Validate())
# ...
FAILED pyarrow/tests/test_pandas.py::test_nested_chunking_valid -
pyarrow.lib.ArrowInvalid: Column 0: In chunk 0: Invalid: List child array
invalid: Invalid: Struct child array #0 has length smaller than expected for
struct array (2 < 3)
```
NOTE: This unit test uses about 7GB of memory (max RSS) on my MacBook Pro.
This might make CI challenging; I'm open to suggestions to limit it.
### Are there any user-facing changes?
No
* Closes: #32439
Lead-authored-by: Mike Lui <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/src/arrow/python/python_to_arrow.cc | 16 +++++---
python/pyarrow/tests/test_pandas.py | 44 ++++++++++++++++++++++
2 files changed, 54 insertions(+), 6 deletions(-)
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc
b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 486bd84077..2143129356 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -926,14 +926,14 @@ class PyStructConverter : public
StructConverter<PyConverter, PyConverterTrait>
}
switch (input_kind_) {
case InputKind::DICT:
- RETURN_NOT_OK(this->struct_builder_->Append());
- return AppendDict(value);
+ RETURN_NOT_OK(AppendDict(value));
+ return this->struct_builder_->Append();
case InputKind::TUPLE:
- RETURN_NOT_OK(this->struct_builder_->Append());
- return AppendTuple(value);
+ RETURN_NOT_OK(AppendTuple(value));
+ return this->struct_builder_->Append();
case InputKind::ITEMS:
- RETURN_NOT_OK(this->struct_builder_->Append());
- return AppendItems(value);
+ RETURN_NOT_OK(AppendItems(value));
+ return this->struct_builder_->Append();
default:
RETURN_NOT_OK(InferInputKind(value));
return Append(value);
@@ -944,6 +944,10 @@ class PyStructConverter : public
StructConverter<PyConverter, PyConverterTrait>
Status Init(MemoryPool* pool) override {
RETURN_NOT_OK((StructConverter<PyConverter,
PyConverterTrait>::Init(pool)));
+ // This implementation will check the child values before appending itself,
+ // so no rewind is necessary
+ this->rewind_on_overflow_ = false;
+
// Store the field names as a PyObjects for dict matching
num_fields_ = this->struct_type_->num_fields();
bytes_field_names_.reset(PyList_New(num_fields_));
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index 0d01928f44..62a9443953 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -1691,6 +1691,7 @@ class TestConvertStringLikeTypes:
'strings': [[v1]] * 20 + [[v2]] + [[b'x']]
})
arr = pa.array(df['strings'], from_pandas=True)
+ arr.validate(full=True)
assert isinstance(arr, pa.ChunkedArray)
assert arr.num_chunks == 2
assert len(arr.chunk(0)) == 21
@@ -2381,6 +2382,7 @@ class TestConvertListTypes:
"b": range(n)
})
table = pa.Table.from_pandas(df)
+ table.validate(full=True)
column_a = table[0]
assert column_a.num_chunks == 2
@@ -2623,6 +2625,7 @@ class TestConvertStructTypes:
ty = pa.struct([pa.field('x', pa.float64()),
pa.field('y', pa.binary())])
arr = pa.array(data, type=ty, from_pandas=True)
+ arr.validate(full=True)
assert arr.num_chunks == 2
def iter_chunked_array(arr):
@@ -2655,6 +2658,7 @@ class TestConvertStructTypes:
# Now with explicit mask
mask = np.random.random_sample(n) < 0.2
arr = pa.array(data, type=ty, mask=mask, from_pandas=True)
+ arr.validate(full=True)
assert arr.num_chunks == 2
check(arr, data, mask)
@@ -4826,6 +4830,7 @@ def test_roundtrip_nested_map_array_with_pydicts_sliced():
def assert_roundtrip(series: pd.Series, data) -> None:
array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series,
type=ty))
+ array_roundtrip.validate(full=True)
assert data.equals(array_roundtrip)
assert_roundtrip(series_default, chunked_array)
@@ -4944,3 +4949,42 @@ def test_array_conversion_for_datetime():
result = arr.to_pandas()
tm.assert_series_equal(result, series)
+
+
[email protected]_memory
+def test_nested_chunking_valid():
+ # GH-32439: Chunking can cause arrays to be in invalid state
+ # when nested types are involved.
+ # Here we simply ensure we validate correctly.
+
+ def roundtrip(df, schema=None):
+ tab = pa.Table.from_pandas(df, schema=schema)
+ tab.validate(full=True)
+ # we expect to trigger chunking internally
+ # an assertion failure here may just mean this threshold has changed
+ num_chunks = tab.column(0).num_chunks
+ assert num_chunks > 1
+ tm.assert_frame_equal(tab.to_pandas(self_destruct=True,
+ maps_as_pydicts="strict"), df)
+
+ x = b"0" * 720000000
+ roundtrip(pd.DataFrame({"strings": [x, x, x]}))
+
+ struct = {"struct_field": x}
+ roundtrip(pd.DataFrame({"structs": [struct, struct, struct]}))
+
+ lists = [x]
+ roundtrip(pd.DataFrame({"lists": [lists, lists, lists]}))
+
+ los = [struct]
+ roundtrip(pd.DataFrame({"los": [los, los, los]}))
+
+ sol = {"struct_field": lists}
+ roundtrip(pd.DataFrame({"sol": [sol, sol, sol]}))
+
+ map_of_los = {"a": los}
+ map_type = pa.map_(pa.string(),
+ pa.list_(pa.struct([("struct_field", pa.binary())])))
+ schema = pa.schema([("maps", map_type)])
+ roundtrip(pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]}),
+ schema=schema)