This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 532c5005f2f [SPARK-38946][PYTHON][PS] Generates a new dataframe instead of operating inplace in setitem 532c5005f2f is described below commit 532c5005f2fd82d714d14a815f435ef48fecc205 Author: Yikun Jiang <yikunk...@gmail.com> AuthorDate: Thu Aug 18 12:34:51 2022 +0900 [SPARK-38946][PYTHON][PS] Generates a new dataframe instead of operating inplace in setitem ### What changes were proposed in this pull request? Generates a new dataframe instead of operating inplace in setitem ### Why are the changes needed? Make CI pass with pandas 1.4.3. Since pandas 1.4.0 (https://github.com/pandas-dev/pandas/commit/03dd698bc1e84c35aba8b51bdd45c472860b9ec3), dataframe.setitem should always make a copy and never write into the existing array. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI test with current pandas (1.3.x) and latest pandas 1.4.2, 1.4.3 Closes #36353 from Yikun/SPARK-38946. Authored-by: Yikun Jiang <yikunk...@gmail.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../source/migration_guide/pyspark_3.3_to_3.4.rst | 2 + python/pyspark/pandas/frame.py | 26 +++++++++---- python/pyspark/pandas/tests/test_dataframe.py | 43 ++++++++++++++++------ 3 files changed, 53 insertions(+), 18 deletions(-) diff --git a/python/docs/source/migration_guide/pyspark_3.3_to_3.4.rst b/python/docs/source/migration_guide/pyspark_3.3_to_3.4.rst index dbe7b818b2a..b3baa8345aa 100644 --- a/python/docs/source/migration_guide/pyspark_3.3_to_3.4.rst +++ b/python/docs/source/migration_guide/pyspark_3.3_to_3.4.rst @@ -37,3 +37,5 @@ Upgrading from PySpark 3.3 to 3.4 * In Spark 3.4, the infer schema process of ``groupby.apply`` in Pandas on Spark, will first infer the pandas type to ensure the accuracy of the pandas ``dtype`` as much as possible. * In Spark 3.4, the ``Series.concat`` sort parameter will be respected to follow pandas 1.4 behaviors. 
+ +* In Spark 3.4, the ``DataFrame.__setitem__`` will make a copy and replace pre-existing arrays, which will NOT be over-written to follow pandas 1.4 behaviors. diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index b3ded9885fc..fb4c3368057 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -498,20 +498,30 @@ class DataFrame(Frame, Generic[T]): return cast(InternalFrame, self._internal_frame) # type: ignore[has-type] def _update_internal_frame( - self, internal: InternalFrame, requires_same_anchor: bool = True + self, + internal: InternalFrame, + requires_same_anchor: bool = True, + anchor_force_disconnect: bool = False, ) -> None: """ Update InternalFrame with the given one. - If the column_label is changed or the new InternalFrame is not the same `anchor`, - disconnect the link to the Series and create a new one. + If the column_label is changed or the new InternalFrame is not the same `anchor` or the + `anchor_force_disconnect` flag is set to True, disconnect the original anchor and create + a new one. If `requires_same_anchor` is `False`, checking whether or not the same anchor is ignored and force to update the InternalFrame, e.g., replacing the internal with the resolved_copy, updating the underlying Spark DataFrame which need to combine a different Spark DataFrame. 
- :param internal: the new InternalFrame - :param requires_same_anchor: whether checking the same anchor + Parameters + ---------- + internal : InternalFrame + The new InternalFrame + requires_same_anchor : bool + Whether checking the same anchor + anchor_force_disconnect : bool + Force to disconnect the original anchor and create a new one """ from pyspark.pandas.series import Series @@ -527,7 +537,7 @@ class DataFrame(Frame, Generic[T]): renamed = old_label != new_label not_same_anchor = requires_same_anchor and not same_anchor(internal, psser) - if renamed or not_same_anchor: + if renamed or not_same_anchor or anchor_force_disconnect: psdf: DataFrame = DataFrame(self._internal.select_column(old_label)) psser._update_anchor(psdf) psser = None @@ -12903,7 +12913,9 @@ defaultdict(<class 'list'>, {'col..., 'col...})] # Same Series. psdf = self._assign({key: value}) - self._update_internal_frame(psdf._internal) + # Since Spark 3.4, df.__setitem__ generates a new dataframe instead of operating + # in-place to follow pandas v1.4 behavior, see also SPARK-38946. 
+ self._update_internal_frame(psdf._internal, anchor_force_disconnect=True) @staticmethod def _index_normalized_label(level: int, labels: Union[Name, Sequence[Name]]) -> List[Label]: diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py index add93faba0c..8bccc108b23 100644 --- a/python/pyspark/pandas/tests/test_dataframe.py +++ b/python/pyspark/pandas/tests/test_dataframe.py @@ -290,7 +290,14 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils): psdf["a"] = psdf["a"] + 10 self.assert_eq(psdf, pdf) - self.assert_eq(psser, pser) + # SPARK-38946: Since Spark 3.4, df.__setitem__ generate a new dataframe to follow + # pandas 1.4 behaviors + if LooseVersion(pd.__version__) >= LooseVersion("1.4.0"): + self.assert_eq(psser, pser) + else: + # Follow pandas latest behavior + with self.assertRaisesRegex(AssertionError, "Series are different"): + self.assert_eq(psser, pser) def test_assign_list(self): pdf, psdf = self.df_pair @@ -1493,6 +1500,15 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils): pdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True) psdf.fillna({"x": -1, "y": -2, "z": -5}, inplace=True) self.assert_eq(psdf, pdf) + # Skip due to pandas bug: https://github.com/pandas-dev/pandas/issues/47188 + if not (LooseVersion("1.4.0") <= LooseVersion(pd.__version__) <= LooseVersion("1.4.2")): + self.assert_eq(psser, pser) + + pser = pdf.z + psser = psdf.z + pdf.fillna(0, inplace=True) + psdf.fillna(0, inplace=True) + self.assert_eq(psdf, pdf) self.assert_eq(psser, pser) s_nan = pd.Series([-1, -2, -5], index=["x", "y", "z"], dtype=int) @@ -1536,14 +1552,15 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils): self.assert_eq(pdf.fillna(method="bfill"), psdf.fillna(method="bfill")) self.assert_eq(pdf.fillna(method="bfill", limit=2), psdf.fillna(method="bfill", limit=2)) - self.assert_eq(psdf.fillna({"x": -1}), pdf.fillna({"x": -1})) - - self.assert_eq( - psdf.fillna({"x": -1, ("x", "b"): -2}), pdf.fillna({"x": 
-1, ("x", "b"): -2}) - ) - self.assert_eq( - psdf.fillna({("x", "b"): -2, "x": -1}), pdf.fillna({("x", "b"): -2, "x": -1}) - ) + # See also: https://github.com/pandas-dev/pandas/issues/47649 + if LooseVersion("1.4.3") != LooseVersion(pd.__version__): + self.assert_eq(psdf.fillna({"x": -1}), pdf.fillna({"x": -1})) + self.assert_eq( + psdf.fillna({"x": -1, ("x", "b"): -2}), pdf.fillna({"x": -1, ("x", "b"): -2}) + ) + self.assert_eq( + psdf.fillna({("x", "b"): -2, "x": -1}), pdf.fillna({("x", "b"): -2, "x": -1}) + ) # check multi index pdf = pdf.set_index([("x", "a"), ("x", "b")]) @@ -2972,7 +2989,9 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils): left_pdf.update(right_pdf) left_psdf.update(right_psdf) self.assert_eq(left_pdf.sort_values(by=["A", "B"]), left_psdf.sort_values(by=["A", "B"])) - self.assert_eq(psser.sort_index(), pser.sort_index()) + # Skip due to pandas bug: https://github.com/pandas-dev/pandas/issues/47188 + if not (LooseVersion("1.4.0") <= LooseVersion(pd.__version__) <= LooseVersion("1.4.2")): + self.assert_eq(psser.sort_index(), pser.sort_index()) left_psdf, left_pdf, right_psdf, right_pdf = get_data() left_pdf.update(right_pdf, overwrite=False) @@ -5243,7 +5262,9 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils): pdf.eval("A = B + C", inplace=True) psdf.eval("A = B + C", inplace=True) self.assert_eq(pdf, psdf) - self.assert_eq(pser, psser) + # Skip due to pandas bug: https://github.com/pandas-dev/pandas/issues/47449 + if not (LooseVersion("1.4.0") <= LooseVersion(pd.__version__) <= LooseVersion("1.4.3")): + self.assert_eq(pser, psser) # doesn't support for multi-index columns columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b"), ("z", "c")]) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org