This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 470aaf32a43e [SPARK-43664][CONNECT][PS] Raise exception for `ps.sql` with Pandas-on-Spark object on Spark Connect
470aaf32a43e is described below

commit 470aaf32a43e3f778e28050df3b81ffd16cd7ff2
Author: Haejoon Lee <haejoon....@databricks.com>
AuthorDate: Thu Oct 12 19:56:53 2023 +0900

[SPARK-43664][CONNECT][PS] Raise exception for `ps.sql` with Pandas-on-Spark object on Spark Connect

### What changes were proposed in this pull request?

This PR proposes to raise a proper exception for `ps.sql` with a Pandas-on-Spark DataFrame on Spark Connect.

### Why are the changes needed?

To improve the error message.

### Does this PR introduce _any_ user-facing change?

No API change, but it improves the error message.

**Before**
```python
>>> psdf = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
>>> ps.sql("SELECT {col}, {col2} FROM {tbl}", col=psdf.A, col2=psdf.B, tbl=psdf)
Traceback (most recent call last):
...
pyspark.errors.exceptions.connect.AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `_pandas_api_32aa6c7b33ac442bab790cfb49f65ca1` cannot be found. Verify the spelling and correctness of the schema and catalog. If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog. To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 17;
'Project ['A, 'B]
+- 'UnresolvedRelation [_pandas_api_32aa6c7b33ac442bab790cfb49f65ca1], [], false

JVM stacktrace:
org.apache.spark.sql.catalyst.ExtendedAnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `_pandas_api_32aa6c7b33ac442bab790cfb49f65ca1` cannot be found. Verify the spelling and correctness of the schema and catalog. ...
```

**After**
```python
>>> psdf = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
>>> ps.sql("SELECT {col}, {col2} FROM {tbl}", col=psdf.A, col2=psdf.B, tbl=psdf)
Traceback (most recent call last):
...
pyspark.errors.exceptions.base.PySparkTypeError: [UNSUPPORTED_DATA_TYPE] Unsupported DataType `DataFrame`.
```
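For illustration only (not part of this patch): a minimal sketch of how calling code might catch the new error when running against Spark Connect. It assumes an active Spark Connect session; `getErrorClass()` comes from the shared `PySparkException` base class.

```python
import pyspark.pandas as ps
from pyspark.errors import PySparkTypeError

psdf = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
try:
    # On Spark Connect, binding a pandas-on-Spark DataFrame now fails fast.
    ps.sql("SELECT {col}, {col2} FROM {tbl}", col=psdf.A, col2=psdf.B, tbl=psdf)
except PySparkTypeError as e:
    # UNSUPPORTED_DATA_TYPE replaces the confusing TABLE_OR_VIEW_NOT_FOUND error.
    print(e.getErrorClass())  # UNSUPPORTED_DATA_TYPE
```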
### How was this patch tested?

The existing CI should pass.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #43237 from itholic/SPARK-43664.

Lead-authored-by: Haejoon Lee <haejoon....@databricks.com>
Co-authored-by: Haejoon Lee <44108233+itho...@users.noreply.github.com>
Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/pandas/sql_formatter.py                 | 13 +++++++++++++
 python/pyspark/pandas/tests/connect/test_parity_sql.py |  4 ++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/pandas/sql_formatter.py b/python/pyspark/pandas/sql_formatter.py
index 91c4f0b7d77b..9800037016c5 100644
--- a/python/pyspark/pandas/sql_formatter.py
+++ b/python/pyspark/pandas/sql_formatter.py
@@ -30,6 +30,8 @@ from pyspark.sql import SparkSession
 from pyspark.pandas.utils import default_session
 from pyspark.pandas.frame import DataFrame
 from pyspark.pandas.series import Series
+from pyspark.errors import PySparkTypeError
+from pyspark.sql.utils import is_remote
 
 __all__ = ["sql"]
 
@@ -59,6 +61,9 @@ def sql(
 
     Also the method can bind named parameters to SQL literals from `args`.
 
+    .. note::
+        pandas-on-Spark DataFrame is not supported for Spark Connect.
+
     Parameters
     ----------
     query : str
@@ -198,6 +203,14 @@ def sql(
         session = default_session()
     formatter = PandasSQLStringFormatter(session)
     try:
+        # ps.DataFrame are not supported for Spark Connect currently.
+        if is_remote():
+            for obj in kwargs.values():
+                if isinstance(obj, ps.DataFrame):
+                    raise PySparkTypeError(
+                        error_class="UNSUPPORTED_DATA_TYPE",
+                        message_parameters={"data_type": type(obj).__name__},
+                    )
         sdf = session.sql(formatter.format(query, **kwargs), args)
     finally:
         formatter.clear()
diff --git a/python/pyspark/pandas/tests/connect/test_parity_sql.py b/python/pyspark/pandas/tests/connect/test_parity_sql.py
index c042de6b9007..2e503cac07a8 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_sql.py
+++ b/python/pyspark/pandas/tests/connect/test_parity_sql.py
@@ -22,11 +22,11 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
 class SQLParityTests(SQLTestsMixin, PandasOnSparkTestUtils, ReusedConnectTestCase):
-    @unittest.skip("TODO(SPARK-43664): Fix TABLE_OR_VIEW_NOT_FOUND from SQLParityTests.")
+    @unittest.skip("Test depends on temp view issue on JVM side.")
     def test_sql_with_index_col(self):
         super().test_sql_with_index_col()
 
-    @unittest.skip("TODO(SPARK-43664): Fix TABLE_OR_VIEW_NOT_FOUND from SQLParityTests.")
+    @unittest.skip("Test depends on temp view issue on JVM side.")
     def test_sql_with_pandas_on_spark_objects(self):
         super().test_sql_with_pandas_on_spark_objects()

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org