This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push: new ca75340d607 [SPARK-42824][CONNECT][PYTHON] Provide a clear error message for unsupported JVM attributes ca75340d607 is described below commit ca75340d607f13932c9a49081ae9effcee5ac3f7 Author: itholic <haejoon....@databricks.com> AuthorDate: Fri Mar 17 11:13:00 2023 +0900 [SPARK-42824][CONNECT][PYTHON] Provide a clear error message for unsupported JVM attributes ### What changes were proposed in this pull request? This pull request proposes an improvement to the error message when trying to access a JVM attribute that is not supported in Spark Connect. Specifically, it adds a more informative error message that clearly indicates which attribute is not supported because Spark Connect does not depend on the JVM. ### Why are the changes needed? Currently, when attempting to access an unsupported JVM attribute in Spark Connect, the error message is not very clear, making it difficult for users to understand the root cause of the issue. This improvement aims to give users more helpful information so that they can address the problem, as shown below: **Before** ```python >>> spark._jsc Traceback (most recent call last): File "<stdin>", line 1, in <module> AttributeError: 'SparkSession' object has no attribute '_jsc' ``` **After** ```python >>> spark._jsc Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/Users/haejoon.lee/Desktop/git_store/spark/python/pyspark/sql/connect/session.py", line 490, in _jsc raise PySparkAttributeError( pyspark.errors.exceptions.base.PySparkAttributeError: [JVM_ATTRIBUTE_NOT_SUPPORTED] Attribute `_jsc` is not supported in Spark Connect as it depends on the JVM. If you need to use this attribute, do not use Spark Connect when creating your session. ``` ### Does this PR introduce _any_ user-facing change? This PR does not introduce any user-facing change in terms of functionality.
However, it improves the error message, which could potentially affect the user experience in a positive way. ### How was this patch tested? This patch was tested by adding new unit tests that specifically target the error message related to unsupported JVM attributes. The tests were run locally on a development environment. Closes #40458 from itholic/SPARK-42824. Authored-by: itholic <haejoon....@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit deac481304489f9b8ecd24ec6f3aed1e0c0d75eb) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/errors/__init__.py | 2 + python/pyspark/errors/error_classes.py | 5 +++ python/pyspark/errors/exceptions/base.py | 6 +++ python/pyspark/sql/connect/column.py | 12 +++++- python/pyspark/sql/connect/dataframe.py | 6 ++- python/pyspark/sql/connect/readwriter.py | 7 ++++ python/pyspark/sql/connect/session.py | 26 ++++++++++++ .../sql/tests/connect/test_connect_basic.py | 49 +++++++++++++++++++++- 8 files changed, 110 insertions(+), 3 deletions(-) diff --git a/python/pyspark/errors/__init__.py b/python/pyspark/errors/__init__.py index 95da7ca2aa8..94117fc5160 100644 --- a/python/pyspark/errors/__init__.py +++ b/python/pyspark/errors/__init__.py @@ -31,6 +31,7 @@ from pyspark.errors.exceptions.base import ( # noqa: F401 SparkUpgradeException, PySparkTypeError, PySparkValueError, + PySparkAttributeError, ) @@ -47,4 +48,5 @@ __all__ = [ "SparkUpgradeException", "PySparkTypeError", "PySparkValueError", + "PySparkAttributeError", ] diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py index 8c0f79f7d5a..dda1f5a1f84 100644 --- a/python/pyspark/errors/error_classes.py +++ b/python/pyspark/errors/error_classes.py @@ -39,6 +39,11 @@ ERROR_CLASSES_JSON = """ "Function `<func_name>` should return Column, got <return_type>." 
] }, + "JVM_ATTRIBUTE_NOT_SUPPORTED" : { + "message" : [ + "Attribute `<attr_name>` is not supported in Spark Connect as it depends on the JVM. If you need to use this attribute, do not use Spark Connect when creating your session." + ] + }, "NOT_BOOL" : { "message" : [ "Argument `<arg_name>` should be a bool, got <arg_type>." diff --git a/python/pyspark/errors/exceptions/base.py b/python/pyspark/errors/exceptions/base.py index 6e67039374d..fa66b80ac3a 100644 --- a/python/pyspark/errors/exceptions/base.py +++ b/python/pyspark/errors/exceptions/base.py @@ -160,3 +160,9 @@ class PySparkTypeError(PySparkException, TypeError): """ Wrapper class for TypeError to support error classes. """ + + +class PySparkAttributeError(PySparkException, AttributeError): + """ + Wrapper class for AttributeError to support error classes. + """ diff --git a/python/pyspark/sql/connect/column.py b/python/pyspark/sql/connect/column.py index d2be32b905e..f30a5f258f2 100644 --- a/python/pyspark/sql/connect/column.py +++ b/python/pyspark/sql/connect/column.py @@ -31,7 +31,7 @@ from typing import ( Optional, ) -from pyspark.errors import PySparkTypeError +from pyspark.errors import PySparkTypeError, PySparkAttributeError from pyspark.sql.types import DataType from pyspark.sql.column import Column as PySparkColumn @@ -433,6 +433,10 @@ class Column: dropFields.__doc__ = PySparkColumn.dropFields.__doc__ def __getattr__(self, item: Any) -> "Column": + if item == "_jc": + raise PySparkAttributeError( + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jc"} + ) if item.startswith("__"): raise AttributeError(item) return self[item] @@ -459,6 +463,12 @@ class Column: __bool__ = __nonzero__ + @property + def _jc(self) -> None: + raise PySparkAttributeError( + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jc"} + ) + Column.__doc__ = PySparkColumn.__doc__ diff --git a/python/pyspark/sql/connect/dataframe.py 
b/python/pyspark/sql/connect/dataframe.py index 0887294ddcf..f1968bc0ad9 100644 --- a/python/pyspark/sql/connect/dataframe.py +++ b/python/pyspark/sql/connect/dataframe.py @@ -50,7 +50,7 @@ from pyspark.sql.dataframe import ( DataFrameStatFunctions as PySparkDataFrameStatFunctions, ) -from pyspark.errors import PySparkTypeError +from pyspark.errors import PySparkTypeError, PySparkAttributeError from pyspark.errors.exceptions.connect import SparkConnectException from pyspark.rdd import PythonEvalType import pyspark.sql.connect.plan as plan @@ -1304,6 +1304,10 @@ class DataFrame: return None def __getattr__(self, name: str) -> "Column": + if name in ["_jseq", "_jdf", "_jmap", "_jcols"]: + raise PySparkAttributeError( + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": name} + ) return self[name] @overload diff --git a/python/pyspark/sql/connect/readwriter.py b/python/pyspark/sql/connect/readwriter.py index 1b58c54b38e..192ec68b92a 100644 --- a/python/pyspark/sql/connect/readwriter.py +++ b/python/pyspark/sql/connect/readwriter.py @@ -30,6 +30,7 @@ from pyspark.sql.readwriter import ( DataFrameReader as PySparkDataFrameReader, DataFrameWriterV2 as PySparkDataFrameWriterV2, ) +from pyspark.errors import PySparkAttributeError if TYPE_CHECKING: from pyspark.sql.connect.dataframe import DataFrame @@ -417,6 +418,12 @@ class DataFrameReader(OptionUtils): jdbc.__doc__ = PySparkDataFrameReader.jdbc.__doc__ + @property + def _jreader(self) -> None: + raise PySparkAttributeError( + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jreader"} + ) + DataFrameReader.__doc__ = PySparkDataFrameReader.__doc__ diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py index 5e7c8361d80..b75cb63c4de 100644 --- a/python/pyspark/sql/connect/session.py +++ b/python/pyspark/sql/connect/session.py @@ -67,6 +67,7 @@ from pyspark.sql.types import ( TimestampType, ) from pyspark.sql.utils import to_str +from 
pyspark.errors import PySparkAttributeError if TYPE_CHECKING: from pyspark.sql.connect._typing import OptionalPrimitiveType @@ -473,6 +474,31 @@ class SparkSession: def readStream(self) -> Any: raise NotImplementedError("readStream() is not implemented.") + @property + def _jsc(self) -> None: + raise PySparkAttributeError( + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jsc"} + ) + + @property + def _jconf(self) -> None: + raise PySparkAttributeError( + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jconf"} + ) + + @property + def _jvm(self) -> None: + raise PySparkAttributeError( + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jvm"} + ) + + @property + def _jsparkSession(self) -> None: + raise PySparkAttributeError( + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", + message_parameters={"attr_name": "_jsparkSession"}, + ) + @property def udf(self) -> "UDFRegistration": from pyspark.sql.connect.udf import UDFRegistration diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py index 9da3285d07e..a8e161a42a6 100644 --- a/python/pyspark/sql/tests/connect/test_connect_basic.py +++ b/python/pyspark/sql/tests/connect/test_connect_basic.py @@ -23,7 +23,7 @@ import shutil import tempfile from collections import defaultdict -from pyspark.errors import PySparkTypeError +from pyspark.errors import PySparkAttributeError, PySparkTypeError from pyspark.sql import SparkSession as PySparkSession, Row from pyspark.sql.types import ( StructType, @@ -2936,6 +2936,53 @@ class SparkConnectBasicTests(SparkConnectSQLTestCase): self.assertEqual(cdf2.schema, sdf2.schema) self.assertEqual(cdf2.collect(), sdf2.collect()) + def test_unsupported_jvm_attribute(self): + # Unsupported jvm attributes for Spark session. 
+ unsupported_attrs = ["_jsc", "_jconf", "_jvm", "_jsparkSession"] + spark_session = self.connect + for attr in unsupported_attrs: + with self.assertRaises(PySparkAttributeError) as pe: + getattr(spark_session, attr) + + self.check_error( + exception=pe.exception, + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", + message_parameters={"attr_name": attr}, + ) + + # Unsupported jvm attributes for DataFrame. + unsupported_attrs = ["_jseq", "_jdf", "_jmap", "_jcols"] + cdf = self.connect.range(10) + for attr in unsupported_attrs: + with self.assertRaises(PySparkAttributeError) as pe: + getattr(cdf, attr) + + self.check_error( + exception=pe.exception, + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", + message_parameters={"attr_name": attr}, + ) + + # Unsupported jvm attributes for Column. + with self.assertRaises(PySparkAttributeError) as pe: + getattr(cdf.id, "_jc") + + self.check_error( + exception=pe.exception, + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", + message_parameters={"attr_name": "_jc"}, + ) + + # Unsupported jvm attributes for DataFrameReader. + with self.assertRaises(PySparkAttributeError) as pe: + getattr(spark_session.read, "_jreader") + + self.check_error( + exception=pe.exception, + error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", + message_parameters={"attr_name": "_jreader"}, + ) + @unittest.skipIf(not should_test_connect, connect_requirement_message) class ClientTests(unittest.TestCase): --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org