This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 04816474bfc [SPARK-43261][PYTHON] Migrate `TypeError` from Spark SQL types into error class
04816474bfc is described below

commit 04816474bfcc05c7d90f7b7e8d35184d95c78cbd
Author: itholic <haejoon....@databricks.com>
AuthorDate: Thu Apr 27 16:55:52 2023 +0800

    [SPARK-43261][PYTHON] Migrate `TypeError` from Spark SQL types into error class

    ### What changes were proposed in this pull request?

    This PR proposes to migrate `TypeError` from Spark SQL types into error classes.

    ### Why are the changes needed?

    To improve PySpark error messages.

    ### Does this PR introduce _any_ user-facing change?

    No API change, only an error-message improvement.

    ### How was this patch tested?

    The existing CI should pass.

    Closes #40926 from itholic/error_sql_types.

    Authored-by: itholic <haejoon....@databricks.com>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/errors/error_classes.py     | 35 +++++++++++++
 python/pyspark/sql/tests/test_dataframe.py |  2 +-
 python/pyspark/sql/tests/test_functions.py | 12 +++--
 python/pyspark/sql/tests/test_types.py     | 61 +++++++++++++++++++----
 python/pyspark/sql/types.py                | 80 +++++++++++++++++++++++-------
 5 files changed, 158 insertions(+), 32 deletions(-)

diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py
index 34efd471707..f35971c4a94 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -44,6 +44,11 @@ ERROR_CLASSES_JSON = """
       "Not supported to call `<func_name>` before initialize <object>."
     ]
   },
+  "CANNOT_ACCEPT_OBJECT_IN_TYPE": {
+    "message": [
+      "`<data_type>` can not accept object `<obj_name>` in type `<obj_type>`."
+    ]
+  },
   "CANNOT_ACCESS_TO_DUNDER": {
     "message": [
       "Dunder(double underscore) attribute is for internal use only."
@@ -69,11 +74,31 @@ ERROR_CLASSES_JSON = """
       "Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions."
     ]
   },
+  "CANNOT_CONVERT_TYPE": {
+    "message": [
+      "Cannot convert <from_type> into <to_type>."
+    ]
+  },
   "CANNOT_INFER_ARRAY_TYPE": {
     "message": [
      "Can not infer Array Type from an list with None as the first element."
    ]
  },
+  "CANNOT_INFER_SCHEMA_FOR_TYPE": {
+    "message": [
+      "Can not infer schema for type: `<data_type>`."
+    ]
+  },
+  "CANNOT_INFER_TYPE_FOR_FIELD": {
+    "message": [
+      "Unable to infer the type of the field `<field_name>`."
+    ]
+  },
+  "CANNOT_MERGE_TYPE": {
+    "message": [
+      "Can not merge type `<data_type1>` and `<data_type2>`."
+    ]
+  },
   "CANNOT_OPEN_SOCKET": {
     "message": [
       "Can not open socket: <errors>."
@@ -155,6 +180,11 @@ ERROR_CLASSES_JSON = """
       "Timeout timestamp (<timestamp>) cannot be earlier than the current watermark (<watermark>)."
     ]
   },
+  "INVALID_TYPENAME_CALL" : {
+    "message" : [
+      "StructField does not have typeName. Use typeName on its type explicitly instead."
+    ]
+  },
   "INVALID_UDF_EVAL_TYPE" : {
     "message" : [
       "Eval type for UDF must be <eval_type>."
@@ -335,6 +365,11 @@ ERROR_CLASSES_JSON = """
       "Argument `<arg_name>` should be an int, got <arg_type>."
     ]
   },
+  "NOT_INT_OR_SLICE_OR_STR" : {
+    "message" : [
+      "Argument `<arg_name>` should be an int, slice or str, got <arg_type>."
+    ]
+  },
   "NOT_IN_BARRIER_STAGE" : {
     "message" : [
       "It is not in a barrier stage."
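
For context on the mechanics: each entry above is a message template. `PySparkTypeError` and the other `PySparkException` subclasses look the error class up in this JSON registry and substitute the `<placeholders>` from `message_parameters`. Below is a minimal sketch of the raising pattern this patch migrates to, reusing the `CANNOT_MERGE_TYPE` class added above; the exact rendered form is an assumption based on PySpark's error framework, not part of this diff:

    from pyspark.errors import PySparkTypeError

    # Raise a structured error instead of an ad-hoc TypeError("...") string.
    raise PySparkTypeError(
        error_class="CANNOT_MERGE_TYPE",
        message_parameters={"data_type1": "LongType", "data_type2": "DoubleType"},
    )
    # Expected to render roughly as:
    # [CANNOT_MERGE_TYPE] Can not merge type `LongType` and `DoubleType`.
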
diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py
index 96b31dfee7b..27e12568b28 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -1010,7 +1010,7 @@ class DataFrameTestsMixin:
         # field types mismatch will cause exception at runtime.
         self.assertRaisesRegex(
             Exception,
-            "FloatType\\(\\) can not accept",
+            "CANNOT_ACCEPT_OBJECT_IN_TYPE",
             lambda: rdd.toDF("key: float, value: string").collect(),
         )
 
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 38de87b0e72..9067de34633 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -1136,11 +1136,17 @@ class FunctionsTestsMixin:
             expected_spark_dtypes, self.spark.range(1).select(F.lit(arr).alias("b")).dtypes
         )
         arr = np.array([1, 2]).astype(np.uint)
-        with self.assertRaisesRegex(
-            TypeError, "The type of array scalar '%s' is not supported" % arr.dtype
-        ):
+        with self.assertRaises(PySparkTypeError) as pe:
             self.spark.range(1).select(F.lit(arr).alias("b"))
 
+        self.check_error(
+            exception=pe.exception,
+            error_class="UNSUPPORTED_NUMPY_ARRAY_SCALAR",
+            message_parameters={
+                "dtype": "uint64",
+            },
+        )
+
     def test_binary_math_function(self):
         funcs, expected = zip(
             *[(F.atan2, 0.13664), (F.hypot, 8.07527), (F.pow, 2.14359), (F.pmod, 1.1)]
diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py
index cd1ae1f2964..49952c2c135 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -26,7 +26,7 @@ import unittest
 
 from pyspark.sql import Row
 from pyspark.sql import functions as F
-from pyspark.errors import AnalysisException
+from pyspark.errors import AnalysisException, PySparkTypeError
 from pyspark.sql.types import (
     ByteType,
     ShortType,
@@ -66,6 +66,7 @@ from pyspark.testing.sqlutils import (
     PythonOnlyPoint,
     MyObject,
 )
+from pyspark.testing.utils import PySparkErrorTestUtils
 
 
 class TypesTestsMixin:
@@ -906,8 +907,13 @@ class TypesTestsMixin:
         self.assertEqual(
             _merge_type(ArrayType(LongType()), ArrayType(LongType())), ArrayType(LongType())
         )
-        with self.assertRaisesRegex(TypeError, "element in array"):
+        with self.assertRaises(PySparkTypeError) as pe:
             _merge_type(ArrayType(LongType()), ArrayType(DoubleType()))
+        self.check_error(
+            exception=pe.exception,
+            error_class="CANNOT_MERGE_TYPE",
+            message_parameters={"data_type1": "LongType", "data_type2": "DoubleType"},
+        )
 
         self.assertEqual(
             _merge_type(MapType(StringType(), LongType()), MapType(StringType(), LongType())),
@@ -919,8 +925,13 @@ class TypesTestsMixin:
             MapType(StringType(), LongType()),
         )
 
-        with self.assertRaisesRegex(TypeError, "value of map"):
+        with self.assertRaises(PySparkTypeError) as pe:
             _merge_type(MapType(StringType(), LongType()), MapType(StringType(), DoubleType()))
+        self.check_error(
+            exception=pe.exception,
+            error_class="CANNOT_MERGE_TYPE",
+            message_parameters={"data_type1": "LongType", "data_type2": "DoubleType"},
+        )
 
         self.assertEqual(
             _merge_type(
@@ -929,11 +940,16 @@ class TypesTestsMixin:
             ),
             StructType([StructField("f1", LongType()), StructField("f2", StringType())]),
         )
-        with self.assertRaisesRegex(TypeError, "field f1"):
+        with self.assertRaises(PySparkTypeError) as pe:
             _merge_type(
                 StructType([StructField("f1", LongType()), StructField("f2", StringType())]),
                 StructType([StructField("f1", DoubleType()), StructField("f2", StringType())]),
             )
+        self.check_error(
+            exception=pe.exception,
+            error_class="CANNOT_MERGE_TYPE",
+            message_parameters={"data_type1": "LongType", "data_type2": "DoubleType"},
+        )
 
         self.assertEqual(
             _merge_type(
@@ -961,7 +977,7 @@ class TypesTestsMixin:
             ),
             StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]),
         )
-        with self.assertRaisesRegex(TypeError, "element in array field f1"):
+        with self.assertRaises(PySparkTypeError) as pe:
             _merge_type(
                 StructType(
                     [StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]
                 ),
@@ -970,6 +986,11 @@ class TypesTestsMixin:
                     [StructField("f1", ArrayType(DoubleType())), StructField("f2", StringType())]
                 ),
             )
+        self.check_error(
+            exception=pe.exception,
+            error_class="CANNOT_MERGE_TYPE",
+            message_parameters={"data_type1": "LongType", "data_type2": "DoubleType"},
+        )
 
         self.assertEqual(
             _merge_type(
@@ -993,7 +1014,7 @@ class TypesTestsMixin:
                 ]
             ),
         )
-        with self.assertRaisesRegex(TypeError, "value of map field f1"):
+        with self.assertRaises(PySparkTypeError) as pe:
             _merge_type(
                 StructType(
                     [
@@ -1008,6 +1029,11 @@ class TypesTestsMixin:
                     ]
                 ),
             )
+        self.check_error(
+            exception=pe.exception,
+            error_class="CANNOT_MERGE_TYPE",
+            message_parameters={"data_type1": "LongType", "data_type2": "DoubleType"},
+        )
 
         self.assertEqual(
             _merge_type(
@@ -1110,10 +1136,16 @@ class TypesTestsMixin:
         unsupported_types = all_types - set(supported_types)
         # test unsupported types
         for t in unsupported_types:
-            with self.assertRaisesRegex(TypeError, "infer the type of the field myarray"):
+            with self.assertRaises(PySparkTypeError) as pe:
                 a = array.array(t)
                 self.spark.createDataFrame([Row(myarray=a)]).collect()
 
+            self.check_error(
+                exception=pe.exception,
+                error_class="CANNOT_INFER_TYPE_FOR_FIELD",
+                message_parameters={"field_name": "myarray"},
+            )
+
     def test_repr(self):
         instances = [
             NullType(),
@@ -1304,7 +1336,7 @@ class DataTypeTests(unittest.TestCase):
         self.assertRaises(ValueError, lambda: row_class(1, 2, 3))
 
 
-class DataTypeVerificationTests(unittest.TestCase):
+class DataTypeVerificationTests(unittest.TestCase, PySparkErrorTestUtils):
     def test_verify_type_exception_msg(self):
         self.assertRaisesRegex(
             ValueError,
@@ -1313,8 +1345,17 @@ class DataTypeVerificationTests(unittest.TestCase):
         )
 
         schema = StructType([StructField("a", StructType([StructField("b", IntegerType())]))])
-        self.assertRaisesRegex(
-            TypeError, "field b in field a", lambda: _make_type_verifier(schema)([["data"]])
+        with self.assertRaises(PySparkTypeError) as pe:
+            _make_type_verifier(schema)([["data"]])
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE",
+            message_parameters={
+                "data_type": "IntegerType()",
+                "obj_name": "data",
+                "obj_type": "str",
+            },
         )
 
     def test_verify_type_ok_nullable(self):
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 721be76e8ba..5876d55e426 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -50,6 +50,7 @@ from py4j.java_gateway import GatewayClient, JavaClass, JavaGateway, JavaObject,
 
 from pyspark.serializers import CloudPickleSerializer
 from pyspark.sql.utils import has_numpy, get_active_spark_context
+from pyspark.errors import PySparkTypeError
 
 if has_numpy:
     import numpy as np
@@ -718,8 +719,9 @@ class StructField(DataType):
         return self.dataType.fromInternal(obj)
 
     def typeName(self) -> str:  # type: ignore[override]
-        raise TypeError(
-            "StructField does not have typeName. " "Use typeName on its type explicitly instead."
+        raise PySparkTypeError(
+            error_class="INVALID_TYPENAME_CALL",
+            message_parameters={},
         )
 
@@ -898,7 +900,10 @@ class StructType(DataType):
         elif isinstance(key, slice):
             return StructType(self.fields[key])
         else:
-            raise TypeError("StructType keys should be strings, integers or slices")
+            raise PySparkTypeError(
+                error_class="NOT_INT_OR_SLICE_OR_STR",
+                message_parameters={"arg_name": "key", "arg_type": type(key).__name__},
+            )
 
     def simpleString(self) -> str:
         return "struct<%s>" % (",".join(f.simpleString() for f in self))
@@ -1584,7 +1589,10 @@ def _infer_type(
         if obj.typecode in _array_type_mappings:
             return ArrayType(_array_type_mappings[obj.typecode](), False)
         else:
-            raise TypeError("not supported type: array(%s)" % obj.typecode)
+            raise PySparkTypeError(
+                error_class="UNSUPPORTED_DATA_TYPE",
+                message_parameters={"data_type": f"array({obj.typecode})"},
+            )
     else:
         try:
             return _infer_schema(
@@ -1593,7 +1601,10 @@ def _infer_type(
                 infer_array_from_first_element=infer_array_from_first_element,
             )
         except TypeError:
-            raise TypeError("not supported type: %s" % type(obj))
+            raise PySparkTypeError(
+                error_class="UNSUPPORTED_DATA_TYPE",
+                message_parameters={"data_type": type(obj).__name__},
+            )
 
 
 def _infer_schema(
@@ -1624,7 +1635,10 @@ def _infer_schema(
         items = sorted(row.__dict__.items())
 
     else:
-        raise TypeError("Can not infer schema for type: %s" % type(row))
+        raise PySparkTypeError(
+            error_class="CANNOT_INFER_SCHEMA_FOR_TYPE",
+            message_parameters={"data_type": type(row).__name__},
+        )
 
     fields = []
     for k, v in items:
@@ -1641,8 +1655,11 @@ def _infer_schema(
                     True,
                 )
             )
-        except TypeError as e:
-            raise TypeError("Unable to infer the type of the field {}.".format(k)) from e
+        except TypeError:
+            raise PySparkTypeError(
+                error_class="CANNOT_INFER_TYPE_FOR_FIELD",
+                message_parameters={"field_name": k},
+            )
     return StructType(fields)
 
 
@@ -1713,7 +1730,10 @@ def _merge_type(
         return a
     elif type(a) is not type(b):
         # TODO: type cast (such as int -> long)
-        raise TypeError(new_msg("Can not merge type %s and %s" % (type(a), type(b))))
+        raise PySparkTypeError(
+            error_class="CANNOT_MERGE_TYPE",
+            message_parameters={"data_type1": type(a).__name__, "data_type2": type(b).__name__},
+        )
 
     # same type
     if isinstance(a, StructType):
@@ -1801,7 +1821,10 @@ def _create_converter(dataType: DataType) -> Callable:
             elif hasattr(obj, "__dict__"):  # object
                 d = obj.__dict__
             else:
-                raise TypeError("Unexpected obj type: %s" % type(obj))
+                raise PySparkTypeError(
+                    error_class="UNSUPPORTED_DATA_TYPE",
+                    message_parameters={"data_type": type(obj).__name__},
+                )
 
         if convert_fields:
             return tuple([conv(d.get(name)) for name, conv in zip(names, converters)])
@@ -1860,7 +1883,7 @@ def _make_type_verifier(
     >>> _make_type_verifier(ArrayType(StringType()))(set())  # doctest: +IGNORE_EXCEPTION_DETAIL
     Traceback (most recent call last):
         ...
-    TypeError:...
+    pyspark.errors.exceptions.base.PySparkTypeError:...
    >>> _make_type_verifier(MapType(StringType(), IntegerType()))({})
    >>> _make_type_verifier(StructType([]))(())
    >>> _make_type_verifier(StructType([]))([])
@@ -1883,7 +1906,9 @@
     Traceback (most recent call last):
         ...
     ValueError:...
-    >>> _make_type_verifier(MapType(StringType(), IntegerType()))({None: 1})
+    >>> _make_type_verifier(  # doctest: +IGNORE_EXCEPTION_DETAIL
+    ...     MapType(StringType(), IntegerType())
+    ... )({None: 1})
     Traceback (most recent call last):
         ...
     ValueError:...
@@ -1929,8 +1954,13 @@ def _make_type_verifier(
     def verify_acceptable_types(obj: Any) -> None:
         # subclass of them can not be fromInternal in JVM
         if type(obj) not in _acceptable_types[_type]:
-            raise TypeError(
-                new_msg("%s can not accept object %r in type %s" % (dataType, obj, type(obj)))
+            raise PySparkTypeError(
+                error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE",
+                message_parameters={
+                    "data_type": str(dataType),
+                    "obj_name": str(obj),
+                    "obj_type": type(obj).__name__,
+                },
             )
 
     if isinstance(dataType, (StringType, CharType, VarcharType)):
@@ -2043,8 +2073,13 @@ def _make_type_verifier(
                 for f, verifier in verifiers:
                     verifier(d.get(f))
             else:
-                raise TypeError(
-                    new_msg("StructType can not accept object %r in type %s" % (obj, type(obj)))
+                raise PySparkTypeError(
+                    error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE",
+                    message_parameters={
+                        "data_type": "StructType",
+                        "obj_name": str(obj),
+                        "obj_type": type(obj).__name__,
+                    },
                 )
 
         verify_value = verify_struct
@@ -2183,7 +2218,13 @@ class Row(tuple):
         True
         """
         if not hasattr(self, "__fields__"):
-            raise TypeError("Cannot convert a Row class into dict")
+            raise PySparkTypeError(
+                error_class="CANNOT_CONVERT_TYPE",
+                message_parameters={
+                    "from_type": "Row",
+                    "to_type": "dict",
+                },
+            )
 
         if recursive:
 
@@ -2368,7 +2409,10 @@ class NumpyArrayConverter:
         else:
             jtpe = self._from_numpy_type_to_java_type(obj.dtype, gateway)
             if jtpe is None:
-                raise TypeError("The type of array scalar '%s' is not supported" % (obj.dtype))
+                raise PySparkTypeError(
+                    error_class="UNSUPPORTED_NUMPY_ARRAY_SCALAR",
+                    message_parameters={"dtype": str(obj.dtype)},
+                )
             jarr = gateway.new_array(jtpe, len(obj))
             for i in range(len(plist)):
                 jarr[i] = plist[i]
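
To close with a sketch of how the migrated errors surface to callers (assumed behavior, not spelled out in this diff): `PySparkTypeError` subclasses both `PySparkException` and the built-in `TypeError`, so pre-existing `except TypeError` handlers keep working, while the error class and its parameters become inspectable:

    from pyspark.errors import PySparkTypeError
    from pyspark.sql.types import DoubleType, LongType, _merge_type

    try:
        # Merging mismatched types now raises the structured error,
        # matching the expectations asserted in test_types.py above.
        _merge_type(LongType(), DoubleType())
    except PySparkTypeError as e:
        assert e.getErrorClass() == "CANNOT_MERGE_TYPE"
        assert e.getMessageParameters() == {
            "data_type1": "LongType",
            "data_type2": "DoubleType",
        }

This structured access is what lets the tests switch from `assertRaisesRegex` against message text to the `check_error` helper.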