This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 593d3d86e50 [SPARK-43209][CONNECT][PYTHON] Migrate Expression errors into error class
593d3d86e50 is described below

commit 593d3d86e5071dde4fe2071b4d5f0a4de0f4bbcb
Author: itholic <haejoon....@databricks.com>
AuthorDate: Mon Apr 24 13:27:40 2023 +0800

    [SPARK-43209][CONNECT][PYTHON] Migrate Expression errors into error class

    ### What changes were proposed in this pull request?

    This PR proposes to migrate Expression errors into PySpark error framework.

    ### Why are the changes needed?

    To leverage the PySpark error framework so that we can provide more actionable and consistent errors for users.

    ### Does this PR introduce _any_ user-facing change?

    No.

    ### How was this patch tested?

    The existing CI should pass.

    Closes #40869 from itholic/error_connect_expr.

    Authored-by: itholic <haejoon....@databricks.com>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/errors/error_classes.py            | 25 +++++++++
 python/pyspark/sql/connect/expressions.py         | 63 ++++++++++++++++++----
 .../sql/tests/connect/test_connect_column.py      | 23 +++++---
 .../sql/tests/connect/test_connect_function.py    | 13 +++--
 4 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py
index d7c1b00d115..4425ed79928 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -54,11 +54,21 @@ ERROR_CLASSES_JSON = """
       "Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions."
     ]
   },
+  "CANNOT_INFER_ARRAY_TYPE": {
+    "message": [
+      "Can not infer Array Type from an list with None as the first element."
+    ]
+  },
   "CANNOT_PARSE_DATATYPE": {
     "message": [
       "Unable to parse datatype from schema. <error>."
     ]
   },
+  "CANNOT_PROVIDE_METADATA": {
+    "message": [
+      "metadata can only be provided for a single column."
+    ]
+  },
   "CANNOT_SET_TOGETHER": {
     "message": [
       "<arg_list> should not be set together."
@@ -319,6 +329,16 @@ ERROR_CLASSES_JSON = """
       "State is either not defined or has already been removed."
     ]
   },
+  "UNSUPPORTED_DATA_TYPE" : {
+    "message" : [
+      "Unsupported DataType `<data_type>`."
+    ]
+  },
+  "UNSUPPORTED_LITERAL" : {
+    "message" : [
+      "Unsupported Literal '<literal>'."
+    ]
+  },
   "UNSUPPORTED_NUMPY_ARRAY_SCALAR" : {
     "message" : [
       "The type of array scalar '<dtype>' is not supported."
@@ -354,6 +374,11 @@ ERROR_CLASSES_JSON = """
       "Value for `<arg_name>` must be True, got '<arg_value>'."
     ]
   },
+  "VALUE_OUT_OF_BOUND" : {
+    "message" : [
+      "Value for `<arg_name>` must be between <min> and <max>."
+    ]
+  },
   "WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION" : {
     "message" : [
       "Function `<func_name>` should take between 1 and 3 arguments, but provided function takes <num_args>."
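For illustration only (not part of the commit): a minimal sketch of how an error class registered in `ERROR_CLASSES_JSON`, such as the new `VALUE_OUT_OF_BOUND`, is raised through the PySpark error framework. The `check_bounds` helper is hypothetical; the `PySparkValueError(error_class=..., message_parameters=...)` call and the `<arg_name>`/`<min>`/`<max>` placeholder substitution follow the usage added to `expressions.py` in the next file of this diff.

```python
from pyspark.errors import PySparkValueError


def check_bounds(arg_name: str, value: int, lo: int, hi: int) -> None:
    # Hypothetical helper, mirroring how expressions.py reports out-of-range values.
    if not (lo <= value <= hi):
        # The framework looks up VALUE_OUT_OF_BOUND in ERROR_CLASSES_JSON and
        # substitutes the <arg_name>, <min> and <max> placeholders.
        raise PySparkValueError(
            error_class="VALUE_OUT_OF_BOUND",
            message_parameters={"arg_name": arg_name, "min": str(lo), "max": str(hi)},
        )


try:
    check_bounds("value", 1 << 63, -(1 << 63), (1 << 63) - 1)
except PySparkValueError as e:
    # Expected to render roughly as:
    # [VALUE_OUT_OF_BOUND] Value for `value` must be between -9223372036854775808 and 9223372036854775807.
    print(e)
```

Raising through a registered error class instead of an ad-hoc f-string keeps the message text in one place and lets tests assert on the class and its parameters rather than on a regex, which is the consistency the PR description refers to.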
diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py
index 8ed365091fc..4fc0147d29b 100644
--- a/python/pyspark/sql/connect/expressions.py
+++ b/python/pyspark/sql/connect/expressions.py
@@ -73,6 +73,7 @@ from pyspark.sql.connect.types import (
     pyspark_types_to_proto_types,
     proto_schema_to_pyspark_data_type,
 )
+from pyspark.errors import PySparkTypeError, PySparkValueError
 
 if TYPE_CHECKING:
     from pyspark.sql.connect.client import SparkConnectClient
@@ -160,7 +161,10 @@ class ColumnAlias(Expression):
             return exp
         else:
             if self._metadata:
-                raise ValueError("metadata can only be provided for a single column")
+                raise PySparkValueError(
+                    error_class="CANNOT_PROVIDE_METADATA",
+                    message_parameters={},
+                )
             exp = proto.Expression()
             exp.alias.name.extend(self._alias)
             exp.alias.expr.CopyFrom(self._parent.to_plan(session))
@@ -255,7 +259,10 @@ class LiteralExpression(Expression):
         elif isinstance(dataType, ArrayType):
             assert isinstance(value, list)
         else:
-            raise TypeError(f"Unsupported Data Type {dataType}")
+            raise PySparkTypeError(
+                error_class="UNSUPPORTED_DATA_TYPE",
+                message_parameters={"data_type": str(dataType)},
+            )
 
         self._value = value
         self._dataType = dataType
@@ -274,7 +281,14 @@ class LiteralExpression(Expression):
             elif JVM_LONG_MIN <= value <= JVM_LONG_MAX:
                 return LongType()
             else:
-                raise ValueError(f"integer {value} out of bounds")
+                raise PySparkValueError(
+                    error_class="VALUE_OUT_OF_BOUND",
+                    message_parameters={
+                        "arg_name": "value",
+                        "min": str(JVM_LONG_MIN),
+                        "max": str(JVM_SHORT_MAX),
+                    },
+                )
         elif isinstance(value, float):
             return DoubleType()
         elif isinstance(value, str):
@@ -297,15 +311,22 @@ class LiteralExpression(Expression):
             # follow the 'infer_array_from_first_element' strategy in 'sql.types._infer_type'
             # right now, it's dedicated for pyspark.ml params like array<...>, array<array<...>>
             if len(value) == 0:
-                raise TypeError("Can not infer Array Type from an empty list")
+                raise PySparkValueError(
+                    error_class="CANNOT_BE_EMPTY",
+                    message_parameters={"item": "value"},
+                )
             first = value[0]
             if first is None:
-                raise TypeError(
-                    "Can not infer Array Type from an list with None as the first element"
+                raise PySparkTypeError(
+                    error_class="CANNOT_INFER_ARRAY_TYPE",
+                    message_parameters={},
                 )
             return ArrayType(LiteralExpression._infer_type(first), True)
 
-        raise TypeError(f"Unsupported Data Type {type(value).__name__}")
+        raise PySparkTypeError(
+            error_class="UNSUPPORTED_DATA_TYPE",
+            message_parameters={"data_type": type(value).__name__},
+        )
 
     @classmethod
     def _from_value(cls, value: Any) -> "LiteralExpression":
@@ -366,7 +387,10 @@ class LiteralExpression(Expression):
             assert elementType == dataType.elementType
             return [LiteralExpression._to_value(v, elementType) for v in literal.array.elements]
 
-        raise TypeError(f"Unsupported Literal Value {literal}")
+        raise PySparkTypeError(
+            error_class="UNSUPPORTED_LITERAL",
+            message_parameters={"literal": str(literal)},
+        )
 
     def to_plan(self, session: "SparkConnectClient") -> "proto.Expression":
         """Converts the literal expression to the literal in proto."""
@@ -413,7 +437,10 @@ class LiteralExpression(Expression):
                     LiteralExpression(v, element_type).to_plan(session).literal
                 )
             else:
-                raise ValueError(f"Unsupported Data Type {self._dataType}")
+                raise PySparkTypeError(
+                    error_class="UNSUPPORTED_DATA_TYPE",
+                    message_parameters={"data_type": str(self._dataType)},
+                )
 
         return expr
 
@@ -940,7 +967,14 @@ class WindowExpression(Expression):
                 elif JVM_INT_MIN <= start <= JVM_INT_MAX:
                     expr.window.frame_spec.lower.value.literal.integer = start
                 else:
-                    raise ValueError(f"start is out of bound: {start}")
+                    raise PySparkValueError(
+                        error_class="VALUE_OUT_OF_BOUND",
+                        message_parameters={
+                            "arg_name": "start",
+                            "min": str(JVM_INT_MIN),
+                            "max": str(JVM_INT_MAX),
+                        },
+                    )
 
                 end = self._windowSpec._frame._end
                 if end == 0:
@@ -950,7 +984,14 @@ class WindowExpression(Expression):
                 elif JVM_INT_MIN <= end <= JVM_INT_MAX:
                     expr.window.frame_spec.upper.value.literal.integer = end
                 else:
-                    raise ValueError(f"end is out of bound: {end}")
+                    raise PySparkValueError(
+                        error_class="VALUE_OUT_OF_BOUND",
+                        message_parameters={
+                            "arg_name": "end",
+                            "min": str(JVM_INT_MIN),
+                            "max": str(JVM_INT_MAX),
+                        },
+                    )
 
             else:
                 expr.window.frame_spec.frame_type = (
diff --git a/python/pyspark/sql/tests/connect/test_connect_column.py b/python/pyspark/sql/tests/connect/test_connect_column.py
index 5703f8d2a3c..a62f4dcfebf 100644
--- a/python/pyspark/sql/tests/connect/test_connect_column.py
+++ b/python/pyspark/sql/tests/connect/test_connect_column.py
@@ -513,18 +513,25 @@ class SparkConnectColumnTests(SparkConnectSQLTestCase):
         self.assertEqual(cdf1.schema, sdf1.schema)
         self.assert_eq(cdf1.toPandas(), sdf1.toPandas())
 
-        with self.assertRaisesRegex(
-            ValueError,
-            "integer 9223372036854775808 out of bounds",
-        ):
+        # negative test for incorrect type
+        with self.assertRaises(PySparkValueError) as pe:
             cdf.select(CF.lit(JVM_LONG_MAX + 1)).show()
 
-        with self.assertRaisesRegex(
-            ValueError,
-            "integer -9223372036854775809 out of bounds",
-        ):
+        self.check_error(
+            exception=pe.exception,
+            error_class="VALUE_OUT_OF_BOUND",
+            message_parameters={"arg_name": "value", "min": "-9223372036854775808", "max": "32767"},
+        )
+
+        with self.assertRaises(PySparkValueError) as pe:
             cdf.select(CF.lit(JVM_LONG_MIN - 1)).show()
 
+        self.check_error(
+            exception=pe.exception,
+            error_class="VALUE_OUT_OF_BOUND",
+            message_parameters={"arg_name": "value", "min": "-9223372036854775808", "max": "32767"},
+        )
+
     def test_cast(self):
         # SPARK-41412: test basic Column.cast
         df = self.connect.read.table(self.tbl_name)
diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py
index 57b39310fe8..38a7ed4df62 100644
--- a/python/pyspark/sql/tests/connect/test_connect_function.py
+++ b/python/pyspark/sql/tests/connect/test_connect_function.py
@@ -17,7 +17,7 @@
 import os
 import unittest
 
-from pyspark.errors import PySparkTypeError
+from pyspark.errors import PySparkTypeError, PySparkValueError
 from pyspark.sql import SparkSession as PySparkSession
 from pyspark.sql.types import StringType, StructType, StructField, ArrayType, IntegerType
 from pyspark.testing.pandasutils import PandasOnSparkTestUtils
@@ -862,12 +862,15 @@ class SparkConnectFunctionTests(ReusedConnectTestCase, PandasOnSparkTestUtils, S
         )
 
         # check error
-        with self.assertRaisesRegex(
-            ValueError,
-            "end is out of bound",
-        ):
+        with self.assertRaises(PySparkValueError) as pe:
             cdf.select(CF.sum("a").over(CW.orderBy("b").rowsBetween(0, (1 << 33)))).show()
 
+        self.check_error(
+            exception=pe.exception,
+            error_class="VALUE_OUT_OF_BOUND",
+            message_parameters={"arg_name": "end", "min": "-2147483648", "max": "2147483647"},
+        )
+
         with self.assertRaises(PySparkTypeError) as pe:
             cdf.select(CF.rank().over(cdf.a))
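For illustration only (not part of the commit): a self-contained sketch of the assertion pattern the migrated tests rely on. It assumes the `getErrorClass()` and `getMessageParameters()` accessors on PySpark exceptions, which is roughly what `self.check_error(...)` verifies in the test base classes; the `raise` inside the `with` block stands in for the Spark Connect calls driven through a real session in the tests above.

```python
import unittest

from pyspark.errors import PySparkValueError


class ValueOutOfBoundErrorTest(unittest.TestCase):
    def test_error_class_and_parameters(self) -> None:
        with self.assertRaises(PySparkValueError) as pe:
            # Stand-in for e.g. CW.orderBy("b").rowsBetween(0, 1 << 33), which the
            # connect tests exercise against a live session.
            raise PySparkValueError(
                error_class="VALUE_OUT_OF_BOUND",
                message_parameters={"arg_name": "end", "min": "-2147483648", "max": "2147483647"},
            )

        # Assert on the error class and its parameters, not on the rendered message text.
        self.assertEqual(pe.exception.getErrorClass(), "VALUE_OUT_OF_BOUND")
        self.assertEqual(
            pe.exception.getMessageParameters(),
            {"arg_name": "end", "min": "-2147483648", "max": "2147483647"},
        )


if __name__ == "__main__":
    unittest.main()
```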