This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 2b01755f2791 [SPARK-47948][PYTHON] Upgrade the minimum `Pandas` version to 2.0.0 2b01755f2791 is described below commit 2b01755f27917b1d391835e6f8b1b2f9a34cc832 Author: Haejoon Lee <haejoon....@databricks.com> AuthorDate: Tue Apr 23 07:49:15 2024 -0700 [SPARK-47948][PYTHON] Upgrade the minimum `Pandas` version to 2.0.0 ### What changes were proposed in this pull request? This PR proposes to bump the minimum Pandas version up to 2.0.0. ### Why are the changes needed? From Apache Spark 4.0.0, Pandas API on Spark supports Pandas 2.0.0 and above, and some features will be broken with Pandas 1.x, so installing Pandas 2.x is required. See the full list of breaking changes in [Upgrading from PySpark 3.5 to 4.0](https://github.com/apache/spark/blob/master/python/docs/source/migration_guide/pyspark_upgrade.rst#upgrading-from-pyspark-35-to-40). ### Does this PR introduce _any_ user-facing change? No API changes, but the minimum Pandas version in user-facing documentation will be changed. ### How was this patch tested? The existing CI should pass. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46175 from itholic/bump_pandas_2. 
Authored-by: Haejoon Lee <haejoon....@databricks.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- dev/create-release/spark-rm/Dockerfile | 2 +- python/docs/source/getting_started/install.rst | 6 +++--- python/docs/source/migration_guide/pyspark_upgrade.rst | 3 +-- python/docs/source/user_guide/sql/arrow_pandas.rst | 2 +- python/packaging/classic/setup.py | 2 +- python/packaging/connect/setup.py | 2 +- python/pyspark/sql/pandas/utils.py | 2 +- 7 files changed, 9 insertions(+), 10 deletions(-) diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index f51b24d58394..8d5ca38ba88e 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -37,7 +37,7 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true # These arguments are just for reuse and not really meant to be customized. ARG APT_INSTALL="apt-get install --no-install-recommends -y" -ARG PIP_PKGS="sphinx==4.5.0 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.13.3 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==3.1.2 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==1.5.3 pyarrow==10.0.1 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.62.0 protobuf==4.21.6 grpcio-status==1.62.0 googleapis-common-protos==1.56.4" +ARG PIP_PKGS="sphinx==4.5.0 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.13.3 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==3.1.2 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==2.0.3 pyarrow==10.0.1 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.62.0 protobuf==4.21.6 grpcio-status==1.62.0 googleapis-common-protos==1.56.4" ARG GEM_PKGS="bundler:2.3.8" # Install extra needed repos and refresh. 
diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 08b6cc813cba..33a0560764df 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -205,7 +205,7 @@ Installable with ``pip install "pyspark[connect]"``. ========================== ================= ========================== Package Supported version Note ========================== ================= ========================== -`pandas` >=1.4.4 Required for Spark Connect +`pandas` >=2.0.0 Required for Spark Connect `pyarrow` >=10.0.0 Required for Spark Connect `grpcio` >=1.62.0 Required for Spark Connect `grpcio-status` >=1.62.0 Required for Spark Connect @@ -220,7 +220,7 @@ Installable with ``pip install "pyspark[sql]"``. ========= ================= ====================== Package Supported version Note ========= ================= ====================== -`pandas` >=1.4.4 Required for Spark SQL +`pandas` >=2.0.0 Required for Spark SQL `pyarrow` >=10.0.0 Required for Spark SQL ========= ================= ====================== @@ -233,7 +233,7 @@ Installable with ``pip install "pyspark[pandas_on_spark]"``. 
========= ================= ================================ Package Supported version Note ========= ================= ================================ -`pandas` >=1.4.4 Required for Pandas API on Spark +`pandas` >=2.0.0 Required for Pandas API on Spark `pyarrow` >=10.0.0 Required for Pandas API on Spark ========= ================= ================================ diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index 36c1eacaf2c7..26fc63430787 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -22,8 +22,7 @@ Upgrading PySpark Upgrading from PySpark 3.5 to 4.0 --------------------------------- -* In Spark 4.0, it is recommended to use Pandas version 2.0.0 or above with PySpark for optimal compatibility. -* In Spark 4.0, the minimum supported version for Pandas has been raised from 1.0.5 to 1.4.4 in PySpark. +* In Spark 4.0, the minimum supported version for Pandas has been raised from 1.0.5 to 2.0.0 in PySpark. * In Spark 4.0, the minimum supported version for Numpy has been raised from 1.15 to 1.21 in PySpark. * In Spark 4.0, the minimum supported version for PyArrow has been raised from 4.0.0 to 10.0.0 in PySpark. * In Spark 4.0, ``Int64Index`` and ``Float64Index`` have been removed from pandas API on Spark, ``Index`` should be used directly. diff --git a/python/docs/source/user_guide/sql/arrow_pandas.rst b/python/docs/source/user_guide/sql/arrow_pandas.rst index 039671608b6d..a5dfb9aa4e52 100644 --- a/python/docs/source/user_guide/sql/arrow_pandas.rst +++ b/python/docs/source/user_guide/sql/arrow_pandas.rst @@ -414,7 +414,7 @@ working with timestamps in ``pandas_udf``\s to get the best performance, see Recommended Pandas and PyArrow Versions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For usage with pyspark.sql, the minimum supported versions of Pandas is 1.4.4 and PyArrow is 10.0.0. 
+For usage with pyspark.sql, the minimum supported versions of Pandas is 2.0.0 and PyArrow is 10.0.0. Higher versions may be used, however, compatibility and data correctness can not be guaranteed and should be verified by the user. diff --git a/python/packaging/classic/setup.py b/python/packaging/classic/setup.py index 06137b8595ac..f6d7cf08e5c9 100755 --- a/python/packaging/classic/setup.py +++ b/python/packaging/classic/setup.py @@ -150,7 +150,7 @@ if in_spark: # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications. # Also don't forget to update python/docs/source/getting_started/install.rst, and # python/packaging/connect/setup.py -_minimum_pandas_version = "1.4.4" +_minimum_pandas_version = "2.0.0" _minimum_numpy_version = "1.21" _minimum_pyarrow_version = "10.0.0" _minimum_grpc_version = "1.62.0" diff --git a/python/packaging/connect/setup.py b/python/packaging/connect/setup.py index 3f2d79a641bc..0a7c82d66a07 100755 --- a/python/packaging/connect/setup.py +++ b/python/packaging/connect/setup.py @@ -117,7 +117,7 @@ try: # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications. # Also don't forget to update python/docs/source/getting_started/install.rst, and # python/packaging/classic/setup.py - _minimum_pandas_version = "1.4.4" + _minimum_pandas_version = "2.0.0" _minimum_numpy_version = "1.21" _minimum_pyarrow_version = "10.0.0" _minimum_grpc_version = "1.59.3" diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py index 654b73e3b93c..fafc3186410c 100644 --- a/python/pyspark/sql/pandas/utils.py +++ b/python/pyspark/sql/pandas/utils.py @@ -22,7 +22,7 @@ from pyspark.errors import PySparkImportError, PySparkRuntimeError def require_minimum_pandas_version() -> None: """Raise ImportError if minimum version of Pandas is not installed""" # TODO(HyukjinKwon): Relocate and deduplicate the version specification. 
- minimum_pandas_version = "1.4.4" + minimum_pandas_version = "2.0.0" try: import pandas --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org