This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 04bf981781ba [SPARK-47737][PYTHON] Bump PyArrow to 10.0.0 04bf981781ba is described below commit 04bf981781ba79d4b2d5a493ea32935eaa177709 Author: Haejoon Lee <haejoon....@databricks.com> AuthorDate: Mon Apr 8 09:44:49 2024 -0700 [SPARK-47737][PYTHON] Bump PyArrow to 10.0.0 ### What changes were proposed in this pull request? This PR proposes to bump PyArrow version up to 10.0.0 ### Why are the changes needed? To leverage the new features from the latest version. ### Does this PR introduce _any_ user-facing change? No API changes, but the PyArrow version from user-facing documentation will be changed. ### How was this patch tested? The existing CI should pass. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45892 from itholic/bump_arrow_10. Authored-by: Haejoon Lee <haejoon....@databricks.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- dev/create-release/spark-rm/Dockerfile | 2 +- python/docs/source/getting_started/install.rst | 2 +- python/docs/source/migration_guide/pyspark_upgrade.rst | 1 + python/docs/source/user_guide/sql/arrow_pandas.rst | 2 +- python/packaging/classic/setup.py | 2 +- python/packaging/connect/setup.py | 2 +- python/pyspark/sql/pandas/utils.py | 2 +- 7 files changed, 7 insertions(+), 6 deletions(-) diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index 2cd50999c4cc..f51b24d58394 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -37,7 +37,7 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true # These arguments are just for reuse and not really meant to be customized. ARG APT_INSTALL="apt-get install --no-install-recommends -y" -ARG PIP_PKGS="sphinx==4.5.0 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.13.3 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==3.1.2 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==1.5.3 pyarrow==3.0.0 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.62.0 protobuf==4.21.6 grpcio-status==1.62.0 googleapis-common-protos==1.56.4" +ARG PIP_PKGS="sphinx==4.5.0 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.13.3 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==3.1.2 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==1.5.3 pyarrow==10.0.1 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.62.0 protobuf==4.21.6 grpcio-status==1.62.0 googleapis-common-protos==1.56.4" ARG GEM_PKGS="bundler:2.3.8" # Install extra needed repos and refresh. diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 6aa89a689480..4c0551433d5a 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -157,7 +157,7 @@ Package Supported version Note ========================== ========================= ====================================================================================== `py4j` >=0.10.9.7 Required `pandas` >=1.4.4 Required for pandas API on Spark and Spark Connect; Optional for Spark SQL -`pyarrow` >=4.0.0 Required for pandas API on Spark and Spark Connect; Optional for Spark SQL +`pyarrow` >=10.0.0 Required for pandas API on Spark and Spark Connect; Optional for Spark SQL `numpy` >=1.21 Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL `grpcio` >=1.62.0 Required for Spark Connect `grpcio-status` >=1.62.0 Required for Spark Connect diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index 1ca5d7aad5d1..36c1eacaf2c7 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -25,6 +25,7 @@ Upgrading from PySpark 3.5 to 4.0 * In Spark 4.0, it is recommended to use Pandas version 2.0.0 or above with PySpark for optimal compatibility. * In Spark 4.0, the minimum supported version for Pandas has been raised from 1.0.5 to 1.4.4 in PySpark. * In Spark 4.0, the minimum supported version for Numpy has been raised from 1.15 to 1.21 in PySpark. +* In Spark 4.0, the minimum supported version for PyArrow has been raised from 4.0.0 to 10.0.0 in PySpark. * In Spark 4.0, ``Int64Index`` and ``Float64Index`` have been removed from pandas API on Spark, ``Index`` should be used directly. * In Spark 4.0, ``DataFrame.iteritems`` has been removed from pandas API on Spark, use ``DataFrame.items`` instead. * In Spark 4.0, ``Series.iteritems`` has been removed from pandas API on Spark, use ``Series.items`` instead. diff --git a/python/docs/source/user_guide/sql/arrow_pandas.rst b/python/docs/source/user_guide/sql/arrow_pandas.rst index ce7c8ebb36e3..039671608b6d 100644 --- a/python/docs/source/user_guide/sql/arrow_pandas.rst +++ b/python/docs/source/user_guide/sql/arrow_pandas.rst @@ -414,7 +414,7 @@ working with timestamps in ``pandas_udf``\s to get the best performance, see Recommended Pandas and PyArrow Versions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For usage with pyspark.sql, the minimum supported versions of Pandas is 1.4.4 and PyArrow is 4.0.0. +For usage with pyspark.sql, the minimum supported versions of Pandas is 1.4.4 and PyArrow is 10.0.0. Higher versions may be used, however, compatibility and data correctness can not be guaranteed and should be verified by the user. diff --git a/python/packaging/classic/setup.py b/python/packaging/classic/setup.py index ddd2448e1c18..8eefc17db700 100755 --- a/python/packaging/classic/setup.py +++ b/python/packaging/classic/setup.py @@ -152,7 +152,7 @@ if in_spark: # python/packaging/connect/setup.py _minimum_pandas_version = "1.4.4" _minimum_numpy_version = "1.21" -_minimum_pyarrow_version = "4.0.0" +_minimum_pyarrow_version = "10.0.0" _minimum_grpc_version = "1.62.0" _minimum_googleapis_common_protos_version = "1.56.4" diff --git a/python/packaging/connect/setup.py b/python/packaging/connect/setup.py index 782c55fff241..3514e5cdc422 100755 --- a/python/packaging/connect/setup.py +++ b/python/packaging/connect/setup.py @@ -91,7 +91,7 @@ try: # python/packaging/classic/setup.py _minimum_pandas_version = "1.4.4" _minimum_numpy_version = "1.21" - _minimum_pyarrow_version = "4.0.0" + _minimum_pyarrow_version = "10.0.0" _minimum_grpc_version = "1.59.3" _minimum_googleapis_common_protos_version = "1.56.4" diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py index ff8183c61746..654b73e3b93c 100644 --- a/python/pyspark/sql/pandas/utils.py +++ b/python/pyspark/sql/pandas/utils.py @@ -61,7 +61,7 @@ def require_minimum_pandas_version() -> None: def require_minimum_pyarrow_version() -> None: """Raise ImportError if minimum version of pyarrow is not installed""" # TODO(HyukjinKwon): Relocate and deduplicate the version specification. - minimum_pyarrow_version = "4.0.0" + minimum_pyarrow_version = "10.0.0" import os --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org