This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 1c3f618ee38 [SPARK-43872][PS] Support `(DataFrame|Series).plot` with pandas 2.0.0 and above 1c3f618ee38 is described below commit 1c3f618ee388e0830c74117b872144303f40cebf Author: itholic <haejoon....@databricks.com> AuthorDate: Fri Aug 11 11:10:14 2023 +0900 [SPARK-43872][PS] Support `(DataFrame|Series).plot` with pandas 2.0.0 and above ### What changes were proposed in this pull request? This PR proposes to remove the `sort_columns` parameter from `(DataFrame|Series).plot` to support pandas 2.0.0. It also enables the following plot tests: - test_area_plot - test_area_plot_stacked_false - test_area_plot_y - test_bar_plot - test_bar_with_x_y - test_barh_plot_with_x_y - test_barh_plot - test_line_plot - test_pie_plot - test_scatter_plot - test_hist_plot - test_kde_plot ### Why are the changes needed? To support pandas 2.0.0 and match its behavior. ### Does this PR introduce _any_ user-facing change? `sort_columns` will no longer be available. ### How was this patch tested? Closes #42390 from itholic/remove_sort_columns. 
Lead-authored-by: itholic <haejoon....@databricks.com> Co-authored-by: Haejoon Lee <44108233+itho...@users.noreply.github.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../source/migration_guide/pyspark_upgrade.rst | 1 + python/pyspark/pandas/plot/matplotlib.py | 13 ----- .../tests/plot/test_frame_plot_matplotlib.py | 56 ---------------------- 3 files changed, 1 insertion(+), 69 deletions(-) diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index 98630133e0c..36d073d4a70 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -35,6 +35,7 @@ Upgrading from PySpark 3.5 to 4.0 * In Spark 4.0, ``include_start`` and ``include_end`` parameters from ``DataFrame.between_time`` have been removed from pandas API on Spark, use ``inclusive`` instead. * In Spark 4.0, ``include_start`` and ``include_end`` parameters from ``Series.between_time`` have been removed from pandas API on Spark, use ``inclusive`` instead. * In Spark 4.0, the various datetime attributes of ``DatetimeIndex`` (``day``, ``month``, ``year`` etc.) are now ``int32`` instead of ``int64`` from pandas API on Spark. +* In Spark 4.0, ``sort_columns`` parameter from ``DataFrame.plot`` and `Series.plot`` has been removed from pandas API on Spark. Upgrading from PySpark 3.3 to 3.4 diff --git a/python/pyspark/pandas/plot/matplotlib.py b/python/pyspark/pandas/plot/matplotlib.py index 39e862bbae8..36cfc759f83 100644 --- a/python/pyspark/pandas/plot/matplotlib.py +++ b/python/pyspark/pandas/plot/matplotlib.py @@ -15,7 +15,6 @@ # limitations under the License. 
# -import warnings from distutils.version import LooseVersion import matplotlib as mat @@ -750,7 +749,6 @@ def plot_frame( yerr=None, xerr=None, secondary_y=False, - sort_columns=False, **kwds, ): """ @@ -836,11 +834,6 @@ def plot_frame( mark_right : boolean, default True When using a secondary_y axis, automatically mark the column labels with "(right)" in the legend - sort_columns: bool, default is False - When True, will sort values on plots. - - .. deprecated:: 3.4.0 - **kwds : keywords Options to pass to matplotlib plotting method @@ -856,11 +849,6 @@ def plot_frame( for bar plot layout by `position` keyword. From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center) """ - warnings.warn( - "Argument `sort_columns` will be removed in 4.0.0.", - FutureWarning, - ) - return _plot( data, kind=kind, @@ -891,7 +879,6 @@ def plot_frame( sharey=sharey, secondary_y=secondary_y, layout=layout, - sort_columns=sort_columns, **kwds, ) diff --git a/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py b/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py index a47968597b4..365d34b1f55 100644 --- a/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +++ b/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py @@ -18,7 +18,6 @@ import base64 from io import BytesIO import unittest -from distutils.version import LooseVersion import pandas as pd import numpy as np @@ -79,11 +78,6 @@ class DataFramePlotMatplotlibTestsMixin: plt.close(ax.figure) return b64_data - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43641): Enable DataFramePlotMatplotlibTests.test_line_plot " - "for pandas 2.0.0.", - ) def test_line_plot(self): def check_line_plot(pdf, psdf): ax1 = pdf.plot(kind="line", colormap="Paired") @@ -108,10 +102,6 @@ class DataFramePlotMatplotlibTestsMixin: psdf1.columns = columns check_line_plot(pdf1, psdf1) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - 
"TODO(SPARK-43634): Enable DataFramePlotMatplotlibTests.test_area_plot for pandas 2.0.0.", - ) def test_area_plot(self): def check_area_plot(pdf, psdf): ax1 = pdf.plot(kind="area", colormap="Paired") @@ -136,11 +126,6 @@ class DataFramePlotMatplotlibTestsMixin: psdf.columns = columns check_area_plot(pdf, psdf) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43635): Enable DataFramePlotMatplotlibTests.test_area_plot_stacked_false " - "for pandas 2.0.0.", - ) def test_area_plot_stacked_false(self): def check_area_plot_stacked_false(pdf, psdf): ax1 = pdf.plot.area(stacked=False) @@ -168,11 +153,6 @@ class DataFramePlotMatplotlibTestsMixin: psdf.columns = columns check_area_plot_stacked_false(pdf, psdf) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43636): Enable DataFramePlotMatplotlibTests.test_area_plot_y " - "for pandas 2.0.0.", - ) def test_area_plot_y(self): def check_area_plot_y(pdf, psdf, y): ax1 = pdf.plot.area(y=y) @@ -199,11 +179,6 @@ class DataFramePlotMatplotlibTestsMixin: psdf.columns = columns check_area_plot_y(pdf, psdf, y=("x", "sales")) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43639): Enable DataFramePlotMatplotlibTests.test_barh_plot_with_x_y " - "for pandas 2.0.0.", - ) def test_barh_plot_with_x_y(self): def check_barh_plot_with_x_y(pdf, psdf, x, y): ax1 = pdf.plot(kind="barh", x=x, y=y, colormap="Paired") @@ -229,11 +204,6 @@ class DataFramePlotMatplotlibTestsMixin: psdf1.columns = columns check_barh_plot_with_x_y(pdf1, psdf1, x=("x", "lab"), y=("y", "val")) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43640): Enable DataFramePlotMatplotlibTests.test_barh_plot " - "for pandas 2.0.0.", - ) def test_barh_plot(self): def check_barh_plot(pdf, psdf): ax1 = pdf.plot(kind="barh", colormap="Paired") @@ -259,10 +229,6 @@ class DataFramePlotMatplotlibTestsMixin: psdf1.columns = 
columns check_barh_plot(pdf1, psdf1) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43637): Enable DataFramePlotMatplotlibTests.test_bar_plot " "for pandas 2.0.0.", - ) def test_bar_plot(self): def check_bar_plot(pdf, psdf): ax1 = pdf.plot(kind="bar", colormap="Paired") @@ -287,11 +253,6 @@ class DataFramePlotMatplotlibTestsMixin: psdf1.columns = columns check_bar_plot(pdf1, psdf1) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43638): Enable DataFramePlotMatplotlibTests.test_bar_with_x_y " - "for pandas 2.0.0.", - ) def test_bar_with_x_y(self): # this is testing plot with specified x and y pdf = pd.DataFrame({"lab": ["A", "B", "C"], "val": [10, 30, 20]}) @@ -326,10 +287,6 @@ class DataFramePlotMatplotlibTestsMixin: bin8 = self.plot_to_base64(ax8) self.assertEqual(bin7, bin8) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43642): Enable DataFramePlotMatplotlibTests.test_pie_plot " "for pandas 2.0.0.", - ) def test_pie_plot(self): def check_pie_plot(pdf, psdf, y): ax1 = pdf.plot.pie(y=y, figsize=(5, 5), colormap="Paired") @@ -391,11 +348,6 @@ class DataFramePlotMatplotlibTestsMixin: error_message = "pie requires either y column or 'subplots=True'" self.assertTrue(error_message in str(context.exception)) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43643): Enable DataFramePlotMatplotlibTests.test_scatter_plot " - "for pandas 2.0.0.", - ) def test_scatter_plot(self): def check_scatter_plot(pdf, psdf, x, y, c): ax1 = pdf.plot.scatter(x=x, y=y) @@ -428,10 +380,6 @@ class DataFramePlotMatplotlibTestsMixin: psdf1.columns = columns check_scatter_plot(pdf1, psdf1, x=("x", "a"), y=("x", "b"), c=("y", "c")) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43720): Enable DataFramePlotMatplotlibTests.test_hist_plot for pandas 2.0.0.", - ) def 
test_hist_plot(self): def check_hist_plot(pdf, psdf): _, ax1 = plt.subplots(1, 1) @@ -483,10 +431,6 @@ class DataFramePlotMatplotlibTestsMixin: psdf1.columns = columns check_hist_plot(pdf1, psdf1) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43722): Enable DataFramePlotMatplotlibTests.test_kde_plot for pandas 2.0.0.", - ) def test_kde_plot(self): def moving_average(a, n=10): ret = np.cumsum(a, dtype=float) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org