[spark] branch master updated: [SPARK-43872][PS] Support `(DataFrame|Series).plot` with pandas 2.0.0 and above

gurwls223 Thu, 10 Aug 2023 19:10:35 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 1c3f618ee38 [SPARK-43872][PS] Support `(DataFrame|Series).plot` with 
pandas 2.0.0 and above
1c3f618ee38 is described below

commit 1c3f618ee388e0830c74117b872144303f40cebf
Author: itholic <haejoon....@databricks.com>
AuthorDate: Fri Aug 11 11:10:14 2023 +0900

    [SPARK-43872][PS] Support `(DataFrame|Series).plot` with pandas 2.0.0 and 
above
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to remove parameter `sort_columns` from 
`(DataFrame|Series).plot` to support pandas 2.0.0.
    
    This PR also enables the following plot tests:
    - test_area_plot
    - test_area_plot_stacked_false
    - test_area_plot_y
    - test_bar_plot
    - test_bar_with_x_y
    - test_barh_plot_with_x_y
    - test_barh_plot
    - test_line_plot
    - test_pie_plot
    - test_scatter_plot
    - test_hist_plot
    - test_kde_plot
    
    ### Why are the changes needed?
    
    To support pandas 2.0.0 & match the behavior.
    
    ### Does this PR introduce _any_ user-facing change?
    
    `sort_columns` will no longer be available.
    
    ### How was this patch tested?
    
    Closes #42390 from itholic/remove_sort_columns.
    
    Lead-authored-by: itholic <haejoon....@databricks.com>
    Co-authored-by: Haejoon Lee <44108233+itho...@users.noreply.github.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 .../source/migration_guide/pyspark_upgrade.rst     |  1 +
 python/pyspark/pandas/plot/matplotlib.py           | 13 -----
 .../tests/plot/test_frame_plot_matplotlib.py       | 56 ----------------------
 3 files changed, 1 insertion(+), 69 deletions(-)

diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst 
b/python/docs/source/migration_guide/pyspark_upgrade.rst
index 98630133e0c..36d073d4a70 100644
--- a/python/docs/source/migration_guide/pyspark_upgrade.rst
+++ b/python/docs/source/migration_guide/pyspark_upgrade.rst
@@ -35,6 +35,7 @@ Upgrading from PySpark 3.5 to 4.0
 * In Spark 4.0, ``include_start`` and ``include_end`` parameters from 
``DataFrame.between_time`` have been removed from pandas API on Spark, use 
``inclusive`` instead.
 * In Spark 4.0, ``include_start`` and ``include_end`` parameters from 
``Series.between_time`` have been removed from pandas API on Spark, use 
``inclusive`` instead.
 * In Spark 4.0, the various datetime attributes of ``DatetimeIndex`` (``day``, 
``month``, ``year`` etc.) are now ``int32`` instead of ``int64`` from pandas 
API on Spark.
+* In Spark 4.0, ``sort_columns`` parameter from ``DataFrame.plot`` and 
``Series.plot`` has been removed from pandas API on Spark.
 
 
 Upgrading from PySpark 3.3 to 3.4
diff --git a/python/pyspark/pandas/plot/matplotlib.py 
b/python/pyspark/pandas/plot/matplotlib.py
index 39e862bbae8..36cfc759f83 100644
--- a/python/pyspark/pandas/plot/matplotlib.py
+++ b/python/pyspark/pandas/plot/matplotlib.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 
-import warnings
 from distutils.version import LooseVersion
 
 import matplotlib as mat
@@ -750,7 +749,6 @@ def plot_frame(
     yerr=None,
     xerr=None,
     secondary_y=False,
-    sort_columns=False,
     **kwds,
 ):
     """
@@ -836,11 +834,6 @@ def plot_frame(
     mark_right : boolean, default True
         When using a secondary_y axis, automatically mark the column
         labels with "(right)" in the legend
-    sort_columns: bool, default is False
-        When True, will sort values on plots.
-
-        .. deprecated:: 3.4.0
-
     **kwds : keywords
         Options to pass to matplotlib plotting method
 
@@ -856,11 +849,6 @@ def plot_frame(
       for bar plot layout by `position` keyword.
       From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center)
     """
-    warnings.warn(
-        "Argument `sort_columns` will be removed in 4.0.0.",
-        FutureWarning,
-    )
-
     return _plot(
         data,
         kind=kind,
@@ -891,7 +879,6 @@ def plot_frame(
         sharey=sharey,
         secondary_y=secondary_y,
         layout=layout,
-        sort_columns=sort_columns,
         **kwds,
     )
 
diff --git a/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py 
b/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py
index a47968597b4..365d34b1f55 100644
--- a/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py
+++ b/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py
@@ -18,7 +18,6 @@
 import base64
 from io import BytesIO
 import unittest
-from distutils.version import LooseVersion
 
 import pandas as pd
 import numpy as np
@@ -79,11 +78,6 @@ class DataFramePlotMatplotlibTestsMixin:
         plt.close(ax.figure)
         return b64_data
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43641): Enable DataFramePlotMatplotlibTests.test_line_plot 
"
-        "for pandas 2.0.0.",
-    )
     def test_line_plot(self):
         def check_line_plot(pdf, psdf):
             ax1 = pdf.plot(kind="line", colormap="Paired")
@@ -108,10 +102,6 @@ class DataFramePlotMatplotlibTestsMixin:
         psdf1.columns = columns
         check_line_plot(pdf1, psdf1)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43634): Enable DataFramePlotMatplotlibTests.test_area_plot 
for pandas 2.0.0.",
-    )
     def test_area_plot(self):
         def check_area_plot(pdf, psdf):
             ax1 = pdf.plot(kind="area", colormap="Paired")
@@ -136,11 +126,6 @@ class DataFramePlotMatplotlibTestsMixin:
         psdf.columns = columns
         check_area_plot(pdf, psdf)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43635): Enable 
DataFramePlotMatplotlibTests.test_area_plot_stacked_false "
-        "for pandas 2.0.0.",
-    )
     def test_area_plot_stacked_false(self):
         def check_area_plot_stacked_false(pdf, psdf):
             ax1 = pdf.plot.area(stacked=False)
@@ -168,11 +153,6 @@ class DataFramePlotMatplotlibTestsMixin:
         psdf.columns = columns
         check_area_plot_stacked_false(pdf, psdf)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43636): Enable 
DataFramePlotMatplotlibTests.test_area_plot_y "
-        "for pandas 2.0.0.",
-    )
     def test_area_plot_y(self):
         def check_area_plot_y(pdf, psdf, y):
             ax1 = pdf.plot.area(y=y)
@@ -199,11 +179,6 @@ class DataFramePlotMatplotlibTestsMixin:
         psdf.columns = columns
         check_area_plot_y(pdf, psdf, y=("x", "sales"))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43639): Enable 
DataFramePlotMatplotlibTests.test_barh_plot_with_x_y "
-        "for pandas 2.0.0.",
-    )
     def test_barh_plot_with_x_y(self):
         def check_barh_plot_with_x_y(pdf, psdf, x, y):
             ax1 = pdf.plot(kind="barh", x=x, y=y, colormap="Paired")
@@ -229,11 +204,6 @@ class DataFramePlotMatplotlibTestsMixin:
         psdf1.columns = columns
         check_barh_plot_with_x_y(pdf1, psdf1, x=("x", "lab"), y=("y", "val"))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43640): Enable DataFramePlotMatplotlibTests.test_barh_plot 
"
-        "for pandas 2.0.0.",
-    )
     def test_barh_plot(self):
         def check_barh_plot(pdf, psdf):
             ax1 = pdf.plot(kind="barh", colormap="Paired")
@@ -259,10 +229,6 @@ class DataFramePlotMatplotlibTestsMixin:
         psdf1.columns = columns
         check_barh_plot(pdf1, psdf1)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43637): Enable DataFramePlotMatplotlibTests.test_bar_plot 
" "for pandas 2.0.0.",
-    )
     def test_bar_plot(self):
         def check_bar_plot(pdf, psdf):
             ax1 = pdf.plot(kind="bar", colormap="Paired")
@@ -287,11 +253,6 @@ class DataFramePlotMatplotlibTestsMixin:
         psdf1.columns = columns
         check_bar_plot(pdf1, psdf1)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43638): Enable 
DataFramePlotMatplotlibTests.test_bar_with_x_y "
-        "for pandas 2.0.0.",
-    )
     def test_bar_with_x_y(self):
         # this is testing plot with specified x and y
         pdf = pd.DataFrame({"lab": ["A", "B", "C"], "val": [10, 30, 20]})
@@ -326,10 +287,6 @@ class DataFramePlotMatplotlibTestsMixin:
         bin8 = self.plot_to_base64(ax8)
         self.assertEqual(bin7, bin8)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43642): Enable DataFramePlotMatplotlibTests.test_pie_plot 
" "for pandas 2.0.0.",
-    )
     def test_pie_plot(self):
         def check_pie_plot(pdf, psdf, y):
             ax1 = pdf.plot.pie(y=y, figsize=(5, 5), colormap="Paired")
@@ -391,11 +348,6 @@ class DataFramePlotMatplotlibTestsMixin:
         error_message = "pie requires either y column or 'subplots=True'"
         self.assertTrue(error_message in str(context.exception))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43643): Enable 
DataFramePlotMatplotlibTests.test_scatter_plot "
-        "for pandas 2.0.0.",
-    )
     def test_scatter_plot(self):
         def check_scatter_plot(pdf, psdf, x, y, c):
             ax1 = pdf.plot.scatter(x=x, y=y)
@@ -428,10 +380,6 @@ class DataFramePlotMatplotlibTestsMixin:
         psdf1.columns = columns
         check_scatter_plot(pdf1, psdf1, x=("x", "a"), y=("x", "b"), c=("y", 
"c"))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43720): Enable DataFramePlotMatplotlibTests.test_hist_plot 
for pandas 2.0.0.",
-    )
     def test_hist_plot(self):
         def check_hist_plot(pdf, psdf):
             _, ax1 = plt.subplots(1, 1)
@@ -483,10 +431,6 @@ class DataFramePlotMatplotlibTestsMixin:
         psdf1.columns = columns
         check_hist_plot(pdf1, psdf1)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43722): Enable DataFramePlotMatplotlibTests.test_kde_plot 
for pandas 2.0.0.",
-    )
     def test_kde_plot(self):
         def moving_average(a, n=10):
             ret = np.cumsum(a, dtype=float)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-43872][PS] Support `(DataFrame|Series).plot` with pandas 2.0.0 and above

Reply via email to