This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 2076abc Revert "[SPARK-33073][PYTHON] Improve error handling on Pandas to Arrow conversion failures" 2076abc is described below commit 2076abc0d3f1055254d65697c21f26727a502391 Author: HyukjinKwon <gurwls...@apache.org> AuthorDate: Wed Oct 7 09:11:42 2020 +0900 Revert "[SPARK-33073][PYTHON] Improve error handling on Pandas to Arrow conversion failures" This reverts commit 4f71231af51a3da5d7964a218a878c0cf3037c10. --- python/pyspark/sql/pandas/serializers.py | 17 +++++++---------- python/pyspark/sql/tests/test_arrow.py | 9 ++++----- python/pyspark/sql/tests/test_pandas_grouped_map.py | 15 +++++++-------- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py index b164a38..4dd15d1 100644 --- a/python/pyspark/sql/pandas/serializers.py +++ b/python/pyspark/sql/pandas/serializers.py @@ -156,16 +156,13 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer): s = _check_series_convert_timestamps_internal(s, self._timezone) try: array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck) - except ValueError as e: - if self._safecheck: - error_msg = "Exception thrown when converting pandas.Series (%s) to " + \ - "Arrow Array (%s). It can be caused by overflows or other " + \ - "unsafe conversions warned by Arrow. Arrow safe type check " + \ - "can be disabled by using SQL config " + \ - "`spark.sql.execution.pandas.convertToArrowArraySafely`." - raise ValueError(error_msg % (s.dtype, t)) from e - else: - raise e + except pa.ArrowException as e: + error_msg = "Exception thrown when converting pandas.Series (%s) to Arrow " + \ + "Array (%s). It can be caused by overflows or other unsafe " + \ + "conversions warned by Arrow. Arrow safe type check can be " + \ + "disabled by using SQL config " + \ + "`spark.sql.execution.pandas.convertToArrowArraySafely`." 
+ raise RuntimeError(error_msg % (s.dtype, t), e) return array arrs = [] diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index 2c6231d..15c5cf1 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -266,12 +266,11 @@ class ArrowTests(ReusedSQLTestCase): def test_createDataFrame_with_incorrect_schema(self): pdf = self.create_pandas_data_frame() fields = list(self.schema) - fields[5], fields[6] = fields[6], fields[5] # swap decimal with date + fields[0], fields[1] = fields[1], fields[0] # swap str with int wrong_schema = StructType(fields) - with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}): - with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "[D|d]ecimal.*got.*date"): - self.spark.createDataFrame(pdf, schema=wrong_schema) + with QuietTest(self.sc): + with self.assertRaisesRegexp(Exception, "integer.*required"): + self.spark.createDataFrame(pdf, schema=wrong_schema) def test_createDataFrame_with_names(self): pdf = self.create_pandas_data_frame() diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index 8e02b29..cc6167e 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -450,16 +450,15 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): def column_name_typo(pdf): return pd.DataFrame({'iid': pdf.id, 'v': pdf.v}) - @pandas_udf('id long, v decimal', PandasUDFType.GROUPED_MAP) + @pandas_udf('id long, v int', PandasUDFType.GROUPED_MAP) def invalid_positional_types(pdf): - return pd.DataFrame([(1, datetime.date(2020, 10, 5))]) + return pd.DataFrame([(u'a', 1.2)]) - with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}): - with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "KeyError: 'id'"): - grouped_df.apply(column_name_typo).collect() - with self.assertRaisesRegexp(Exception, "[D|d]ecimal.*got.*date"): - grouped_df.apply(invalid_positional_types).collect() + with QuietTest(self.sc): + with self.assertRaisesRegexp(Exception, "KeyError: 'id'"): + grouped_df.apply(column_name_typo).collect() + with self.assertRaisesRegexp(Exception, "an integer is required"): + grouped_df.apply(invalid_positional_types).collect() def test_positional_assignment_conf(self): with self.sql_conf({ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org