[spark] branch master updated: [SPARK-38946][PYTHON][PS] Generates a new dataframe instead of operating inplace in setitem

gurwls223 Wed, 17 Aug 2022 20:35:41 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 532c5005f2f [SPARK-38946][PYTHON][PS] Generates a new dataframe 
instead of operating inplace in setitem
532c5005f2f is described below

commit 532c5005f2fd82d714d14a815f435ef48fecc205
Author: Yikun Jiang <yikunk...@gmail.com>
AuthorDate: Thu Aug 18 12:34:51 2022 +0900

    [SPARK-38946][PYTHON][PS] Generates a new dataframe instead of operating 
inplace in setitem
    
    ### What changes were proposed in this pull request?
    
    Generates a new dataframe instead of operating inplace in setitem
    
    ### Why are the changes needed?
    Make CI pass with pandas 1.4.3.
    
    Since pandas 1.4.0 (see 
https://github.com/pandas-dev/pandas/commit/03dd698bc1e84c35aba8b51bdd45c472860b9ec3
 ), ``DataFrame.__setitem__`` should always make a copy and never write into the 
existing array.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    CI test with current pandas (1.3.x) and latest pandas 1.4.2, 1.4.3
    
    Closes #36353 from Yikun/SPARK-38946.
    
    Authored-by: Yikun Jiang <yikunk...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 .../source/migration_guide/pyspark_3.3_to_3.4.rst  |  2 +
 python/pyspark/pandas/frame.py                     | 26 +++++++++----
 python/pyspark/pandas/tests/test_dataframe.py      | 43 ++++++++++++++++------
 3 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/python/docs/source/migration_guide/pyspark_3.3_to_3.4.rst 
b/python/docs/source/migration_guide/pyspark_3.3_to_3.4.rst
index dbe7b818b2a..b3baa8345aa 100644
--- a/python/docs/source/migration_guide/pyspark_3.3_to_3.4.rst
+++ b/python/docs/source/migration_guide/pyspark_3.3_to_3.4.rst
@@ -37,3 +37,5 @@ Upgrading from PySpark 3.3 to 3.4
 * In Spark 3.4, the infer schema process of ``groupby.apply`` in Pandas on 
Spark, will first infer the pandas type to ensure the accuracy of the pandas 
``dtype`` as much as possible.
 
 * In Spark 3.4, the ``Series.concat`` sort parameter will be respected to 
follow pandas 1.4 behaviors.
+
+* In Spark 3.4, ``DataFrame.__setitem__`` makes a copy and replaces 
pre-existing arrays instead of overwriting them in place, to follow pandas 1.4 
behavior.
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index b3ded9885fc..fb4c3368057 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -498,20 +498,30 @@ class DataFrame(Frame, Generic[T]):
         return cast(InternalFrame, self._internal_frame)  # type: 
ignore[has-type]
 
     def _update_internal_frame(
-        self, internal: InternalFrame, requires_same_anchor: bool = True
+        self,
+        internal: InternalFrame,
+        requires_same_anchor: bool = True,
+        anchor_force_disconnect: bool = False,
     ) -> None:
         """
         Update InternalFrame with the given one.
 
-        If the column_label is changed or the new InternalFrame is not the 
same `anchor`,
-        disconnect the link to the Series and create a new one.
+        If the column_label is changed or the new InternalFrame is not the 
same `anchor` or the
+        `anchor_force_disconnect` flag is set to True, disconnect the original 
anchor and create
+        a new one.
 
         If `requires_same_anchor` is `False`, checking whether or not the same 
anchor is ignored
         and force to update the InternalFrame, e.g., replacing the internal 
with the resolved_copy,
         updating the underlying Spark DataFrame which need to combine a 
different Spark DataFrame.
 
-        :param internal: the new InternalFrame
-        :param requires_same_anchor: whether checking the same anchor
+        Parameters
+        ----------
+        internal : InternalFrame
+            The new InternalFrame
+        requires_same_anchor : bool
+            Whether checking the same anchor
+        anchor_force_disconnect : bool
+            Force to disconnect the original anchor and create a new one
         """
         from pyspark.pandas.series import Series
 
@@ -527,7 +537,7 @@ class DataFrame(Frame, Generic[T]):
                     renamed = old_label != new_label
                     not_same_anchor = requires_same_anchor and not 
same_anchor(internal, psser)
 
-                    if renamed or not_same_anchor:
+                    if renamed or not_same_anchor or anchor_force_disconnect:
                         psdf: DataFrame = 
DataFrame(self._internal.select_column(old_label))
                         psser._update_anchor(psdf)
                         psser = None
@@ -12903,7 +12913,9 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
             # Same Series.
             psdf = self._assign({key: value})
 
-        self._update_internal_frame(psdf._internal)
+        # Since Spark 3.4, df.__setitem__ generates a new dataframe instead of 
operating
+        # in-place to follow pandas v1.4 behavior, see also SPARK-38946.
+        self._update_internal_frame(psdf._internal, 
anchor_force_disconnect=True)
 
     @staticmethod
     def _index_normalized_label(level: int, labels: Union[Name, 
Sequence[Name]]) -> List[Label]:
diff --git a/python/pyspark/pandas/tests/test_dataframe.py 
b/python/pyspark/pandas/tests/test_dataframe.py
index add93faba0c..8bccc108b23 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -290,7 +290,14 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
         psdf["a"] = psdf["a"] + 10
 
         self.assert_eq(psdf, pdf)
-        self.assert_eq(psser, pser)
+        # SPARK-38946: Since Spark 3.4, df.__setitem__ generates a new 
dataframe to follow
+        # pandas 1.4 behavior.
+        if LooseVersion(pd.__version__) >= LooseVersion("1.4.0"):
+            self.assert_eq(psser, pser)
+        else:
+            # Follows the latest pandas behavior, so the Series differ under pandas < 1.4
+            with self.assertRaisesRegex(AssertionError, "Series are 
different"):
+                self.assert_eq(psser, pser)
 
     def test_assign_list(self):
         pdf, psdf = self.df_pair
@@ -1493,6 +1500,15 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
         pdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True)
         psdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True)
         self.assert_eq(psdf, pdf)
+        # Skip due to pandas bug: 
https://github.com/pandas-dev/pandas/issues/47188
+        if not (LooseVersion("1.4.0") <= LooseVersion(pd.__version__) <= 
LooseVersion("1.4.2")):
+            self.assert_eq(psser, pser)
+
+        pser = pdf.z
+        psser = psdf.z
+        pdf.fillna(0, inplace=True)
+        psdf.fillna(0, inplace=True)
+        self.assert_eq(psdf, pdf)
         self.assert_eq(psser, pser)
 
         s_nan = pd.Series([-1, -2, -5], index=["x", "y", "z"], dtype=int)
@@ -1536,14 +1552,15 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
         self.assert_eq(pdf.fillna(method="bfill"), psdf.fillna(method="bfill"))
         self.assert_eq(pdf.fillna(method="bfill", limit=2), 
psdf.fillna(method="bfill", limit=2))
 
-        self.assert_eq(psdf.fillna({"x": -1}), pdf.fillna({"x": -1}))
-
-        self.assert_eq(
-            psdf.fillna({"x": -1, ("x", "b"): -2}), pdf.fillna({"x": -1, ("x", 
"b"): -2})
-        )
-        self.assert_eq(
-            psdf.fillna({("x", "b"): -2, "x": -1}), pdf.fillna({("x", "b"): 
-2, "x": -1})
-        )
+        # See also: https://github.com/pandas-dev/pandas/issues/47649
+        if LooseVersion("1.4.3") != LooseVersion(pd.__version__):
+            self.assert_eq(psdf.fillna({"x": -1}), pdf.fillna({"x": -1}))
+            self.assert_eq(
+                psdf.fillna({"x": -1, ("x", "b"): -2}), pdf.fillna({"x": -1, 
("x", "b"): -2})
+            )
+            self.assert_eq(
+                psdf.fillna({("x", "b"): -2, "x": -1}), pdf.fillna({("x", 
"b"): -2, "x": -1})
+            )
 
         # check multi index
         pdf = pdf.set_index([("x", "a"), ("x", "b")])
@@ -2972,7 +2989,9 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
         left_pdf.update(right_pdf)
         left_psdf.update(right_psdf)
         self.assert_eq(left_pdf.sort_values(by=["A", "B"]), 
left_psdf.sort_values(by=["A", "B"]))
-        self.assert_eq(psser.sort_index(), pser.sort_index())
+        # Skip due to pandas bug: 
https://github.com/pandas-dev/pandas/issues/47188
+        if not (LooseVersion("1.4.0") <= LooseVersion(pd.__version__) <= 
LooseVersion("1.4.2")):
+            self.assert_eq(psser.sort_index(), pser.sort_index())
 
         left_psdf, left_pdf, right_psdf, right_pdf = get_data()
         left_pdf.update(right_pdf, overwrite=False)
@@ -5243,7 +5262,9 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
         pdf.eval("A = B + C", inplace=True)
         psdf.eval("A = B + C", inplace=True)
         self.assert_eq(pdf, psdf)
-        self.assert_eq(pser, psser)
+        # Skip due to pandas bug: 
https://github.com/pandas-dev/pandas/issues/47449
+        if not (LooseVersion("1.4.0") <= LooseVersion(pd.__version__) <= 
LooseVersion("1.4.3")):
+            self.assert_eq(pser, psser)
 
         # doesn't support for multi-index columns
         columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b"), ("z", 
"c")])


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-38946][PYTHON][PS] Generates a new dataframe instead of operating inplace in setitem

Reply via email to