This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 90f8039  [SPARK-28041][PYTHON] Increase minimum supported Pandas to 0.23.2
90f8039 is described below

commit 90f80395af629f5c19d3f552c54b2cc63eb7a76a
Author: Bryan Cutler <cutl...@gmail.com>
AuthorDate: Tue Jun 18 09:10:58 2019 +0900

    [SPARK-28041][PYTHON] Increase minimum supported Pandas to 0.23.2

    ## What changes were proposed in this pull request?

    This increases the minimum supported version of Pandas to 0.23.2. Using a lower
    version will raise the error `Pandas >= 0.23.2 must be installed; however, your
    version was 0.XX`. Also, a workaround for using PyArrow with Pandas 0.19.2 was
    removed.

    ## How was this patch tested?

    Existing tests.

    Closes #24867 from BryanCutler/pyspark-increase-min-pandas-SPARK-28041.

    Authored-by: Bryan Cutler <cutl...@gmail.com>
    Signed-off-by: HyukjinKwon <gurwls...@apache.org>
---
 docs/sql-migration-guide-upgrade.md    | 4 ++++
 python/pyspark/serializers.py          | 2 --
 python/pyspark/sql/tests/test_arrow.py | 4 ++--
 python/pyspark/sql/utils.py            | 2 +-
 python/setup.py                        | 2 +-
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/sql-migration-guide-upgrade.md b/docs/sql-migration-guide-upgrade.md
index 37be86f..b062a04 100644
--- a/docs/sql-migration-guide-upgrade.md
+++ b/docs/sql-migration-guide-upgrade.md
@@ -23,6 +23,10 @@ license: |
 {:toc}

 ## Upgrading From Spark SQL 2.4 to 3.0
+  - Since Spark 3.0, PySpark requires a Pandas version of 0.23.2 or higher to use Pandas related functionality, such as `toPandas`, `createDataFrame` from Pandas DataFrame, etc.
+
+  - Since Spark 3.0, PySpark requires a PyArrow version of 0.12.1 or higher to use PyArrow related functionality, such as `pandas_udf`, `toPandas` and `createDataFrame` with "spark.sql.execution.arrow.enabled=true", etc.
+
   - In Spark version 2.4 and earlier, SQL queries such as `FROM <table>` or `FROM <table> UNION ALL FROM <table>` are supported by accident. In hive-style `FROM <table> SELECT <expr>`, the `SELECT` clause is not negligible. Neither Hive nor Presto support this syntax. Therefore we will treat these queries as invalid since Spark 3.0.

   - Since Spark 3.0, the Dataset and DataFrame API `unionAll` is not deprecated any more. It is an alias for `union`.
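For context, below is a minimal usage sketch of the Pandas/PyArrow functionality
these new minimum versions gate. It is not part of the commit; the application
name and sample data are illustrative only.

    import pandas as pd
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("arrow-demo").getOrCreate()

    # Arrow-backed transfer between Spark and Pandas; requires
    # pyarrow >= 0.12.1 as of Spark 3.0.
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    pdf = pd.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})

    # Both directions require pandas >= 0.23.2 as of Spark 3.0; with an older
    # Pandas installed, PySpark raises ImportError before attempting conversion.
    df = spark.createDataFrame(pdf)
    roundtrip = df.toPandas()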
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index 516ee7e..fc0828b 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -297,8 +297,6 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer):
         # Ensure timestamp series are in expected form for Spark internal representation
         if t is not None and pa.types.is_timestamp(t):
             s = _check_series_convert_timestamps_internal(s.fillna(0), self._timezone)
-            # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
-            return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)

         try:
             array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck)
diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
index cb51241..0671137 100644
--- a/python/pyspark/sql/tests/test_arrow.py
+++ b/python/pyspark/sql/tests/test_arrow.py
@@ -268,10 +268,10 @@ class ArrowTests(ReusedSQLTestCase):
     def test_createDataFrame_with_incorrect_schema(self):
         pdf = self.create_pandas_data_frame()
         fields = list(self.schema)
-        fields[0], fields[7] = fields[7], fields[0]  # swap str with timestamp
+        fields[0], fields[1] = fields[1], fields[0]  # swap str with int
         wrong_schema = StructType(fields)
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(Exception, ".*cast.*[s|S]tring.*timestamp.*"):
+            with self.assertRaisesRegexp(Exception, "integer.*required.*got.*str"):
                 self.spark.createDataFrame(pdf, schema=wrong_schema)

     def test_createDataFrame_with_names(self):
diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py
index 1c96e33..ca5e85b 100644
--- a/python/pyspark/sql/utils.py
+++ b/python/pyspark/sql/utils.py
@@ -131,7 +131,7 @@ def require_minimum_pandas_version():
     """ Raise ImportError if minimum version of Pandas is not installed
     """
     # TODO(HyukjinKwon): Relocate and deduplicate the version specification.
-    minimum_pandas_version = "0.19.2"
+    minimum_pandas_version = "0.23.2"

     from distutils.version import LooseVersion
     try:
diff --git a/python/setup.py b/python/setup.py
index e769bf5..ee5c326 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -105,7 +105,7 @@ if (in_spark):
 # If you are changing the versions here, please also change ./python/pyspark/sql/utils.py
 # For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the
 # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
-_minimum_pandas_version = "0.19.2"
+_minimum_pandas_version = "0.23.2"
 _minimum_pyarrow_version = "0.12.1"

 try:
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
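For reference, the version gate touched in python/pyspark/sql/utils.py follows
the LooseVersion pattern sketched below. This is a simplified paraphrase of
that pattern, not the exact Spark source.

    from distutils.version import LooseVersion

    def require_minimum_pandas_version():
        """Raise ImportError if a Pandas older than the minimum is installed."""
        minimum_pandas_version = "0.23.2"
        try:
            import pandas
        except ImportError:
            raise ImportError("Pandas >= %s must be installed; however, "
                              "it was not found." % minimum_pandas_version)
        # LooseVersion compares dotted version strings component by component,
        # so "0.19.2" < "0.23.2" evaluates as expected.
        if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
            raise ImportError("Pandas >= %s must be installed; however, "
                              "your version was %s."
                              % (minimum_pandas_version, pandas.__version__))

Note that python/setup.py and python/pyspark/sql/utils.py pin the same version
string, which is why the comment in setup.py asks that both be changed together.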