[spark] branch master updated: [SPARK-44845][YARN][DEPLOY] Fix file system uri comparison function
This is an automated email from the ASF dual-hosted git repository. mridulm80 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 622bbf2e292 [SPARK-44845][YARN][DEPLOY] Fix file system uri comparison function 622bbf2e292 is described below commit 622bbf2e29262c34021cb38c4c70f8eed258999b Author: zekai-li <58294989+zekai...@users.noreply.github.com> AuthorDate: Wed Sep 6 23:15:53 2023 -0500 [SPARK-44845][YARN][DEPLOY] Fix file system uri comparison function ### What changes were proposed in this pull request? ### Why are the changes needed? In the org.apache.spark.deploy.yarn.Client#compareUri method, hdfs://hadoop81:8020 and hdfs://192.168.0.81:8020 are regarded as different file systems (hadoop81 corresponds to 192.168.0.81). The specific reason is that in the last pr, different URIs of user information are also regarded as different file systems. URI.getAuthority is used to determine the user information, but the authority also contains the host, so the two URIs above always compare as different. To determine whether the user a [...] the last pr and issue link: https://issues.apache.org/jira/browse/SPARK-22587 https://github.com/apache/spark/pull/19885 ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? Closes #42529 from zekai-li/master. 
Authored-by: zekai-li <58294989+zekai...@users.noreply.github.com> Signed-off-by: Mridul Muralidharan gmail.com> --- .../yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 7 --- .../src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 8257a08fd14..a675054b447 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1618,9 +1618,10 @@ private[spark] object Client extends Logging { return false } -val srcAuthority = srcUri.getAuthority() -val dstAuthority = dstUri.getAuthority() -if (srcAuthority != null && !srcAuthority.equalsIgnoreCase(dstAuthority)) { +val srcUserInfo = Option(srcUri.getUserInfo).getOrElse("") +val dstUserInfo = Option(dstUri.getUserInfo).getOrElse("") + +if (!srcUserInfo.equals(dstUserInfo)) { return false } diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index b7fb409ebc3..8802c59e78b 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -675,7 +675,8 @@ class ClientSuite extends SparkFunSuite ("files URI match test2", "file:///c:file1", "file://c:file2"), ("files URI match test3", "file://host/file1", "file://host/file2"), ("wasb URI match test", "wasb://bucket1@user", "wasb://bucket1@user/"), -("hdfs URI match test", "hdfs:/path1", "hdfs:/path1") +("hdfs URI match test1", "hdfs:/path1", "hdfs:/path1"), +("hdfs URI match test2", "hdfs://localhost:8080", "hdfs://127.0.0.1:8080") ) 
matching.foreach { t => @@ -691,7 +692,7 @@ class ClientSuite extends SparkFunSuite ("files URI unmatch test3", "file://host/file1", "file://host2/file2"), ("wasb URI unmatch test1", "wasb://bucket1@user", "wasb://bucket2@user/"), ("wasb URI unmatch test2", "wasb://bucket1@user", "wasb://bucket1@user2/"), -("s3 URI unmatch test", "s3a://user@pass:bucket1/", "s3a://user2@pass2:bucket1/"), +("s3 URI unmatch test", "s3a://user:pass@bucket1/", "s3a://user2:pass2@bucket1/"), ("hdfs URI unmatch test1", "hdfs://namenode1/path1", "hdfs://namenode1:8080/path2"), ("hdfs URI unmatch test2", "hdfs://namenode1:8020/path1", "hdfs://namenode1:8080/path2") ) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-45091][PYTHON][CONNECT][SQL] Function `floor/round/bround` accept Column type `scale`
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 56393b90723 [SPARK-45091][PYTHON][CONNECT][SQL] Function `floor/round/bround` accept Column type `scale` 56393b90723 is described below commit 56393b90723257a757b7b87fb623847ef03d4bf3 Author: Ruifeng Zheng AuthorDate: Thu Sep 7 10:44:25 2023 +0800 [SPARK-45091][PYTHON][CONNECT][SQL] Function `floor/round/bround` accept Column type `scale` ### What changes were proposed in this pull request? 1, `floor`: add missing parameter `scale` in Python, which already existed in Scala for a long time; 2, `round/bround`: parameter `scale` support Column type, to be consistent with `floor/ceil/ceiling` ### Why are the changes needed? to make related functions consistent ### Does this PR introduce _any_ user-facing change? yes ### How was this patch tested? added doctest ### Was this patch authored or co-authored using generative AI tooling? NO Closes #42833 from zhengruifeng/py_func_floor. 
Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- .../scala/org/apache/spark/sql/functions.scala | 18 + python/pyspark/sql/connect/functions.py| 24 -- python/pyspark/sql/functions.py| 89 ++ .../scala/org/apache/spark/sql/functions.scala | 22 ++ 4 files changed, 131 insertions(+), 22 deletions(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 54bf0106956..bf536c349cb 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -2845,6 +2845,15 @@ object functions { */ def round(e: Column, scale: Int): Column = Column.fn("round", e, lit(scale)) + /** + * Round the value of `e` to `scale` decimal places with HALF_UP round mode if `scale` is + * greater than or equal to 0 or at integral part when `scale` is less than 0. + * + * @group math_funcs + * @since 4.0.0 + */ + def round(e: Column, scale: Column): Column = Column.fn("round", e, scale) + /** * Returns the value of the column `e` rounded to 0 decimal places with HALF_EVEN round mode. * @@ -2862,6 +2871,15 @@ object functions { */ def bround(e: Column, scale: Int): Column = Column.fn("bround", e, lit(scale)) + /** + * Round the value of `e` to `scale` decimal places with HALF_EVEN round mode if `scale` is + * greater than or equal to 0 or at integral part when `scale` is less than 0. 
+ * + * @group math_funcs + * @since 4.0.0 + */ + def bround(e: Column, scale: Column): Column = Column.fn("bround", e, scale) + /** * @param e * angle in radians diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py index cc03a3a3578..892ad6e6295 100644 --- a/python/pyspark/sql/connect/functions.py +++ b/python/pyspark/sql/connect/functions.py @@ -538,8 +538,12 @@ def bin(col: "ColumnOrName") -> Column: bin.__doc__ = pysparkfuncs.bin.__doc__ -def bround(col: "ColumnOrName", scale: int = 0) -> Column: -return _invoke_function("bround", _to_col(col), lit(scale)) +def bround(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Column: +if scale is None: +return _invoke_function_over_columns("bround", col) +else: +scale = lit(scale) if isinstance(scale, int) else scale +return _invoke_function_over_columns("bround", col, scale) bround.__doc__ = pysparkfuncs.bround.__doc__ @@ -644,8 +648,12 @@ def factorial(col: "ColumnOrName") -> Column: factorial.__doc__ = pysparkfuncs.factorial.__doc__ -def floor(col: "ColumnOrName") -> Column: -return _invoke_function_over_columns("floor", col) +def floor(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Column: +if scale is None: +return _invoke_function_over_columns("floor", col) +else: +scale = lit(scale) if isinstance(scale, int) else scale +return _invoke_function_over_columns("floor", col, scale) floor.__doc__ = pysparkfuncs.floor.__doc__ @@ -773,8 +781,12 @@ def rint(col: "ColumnOrName") -> Column: rint.__doc__ = pysparkfuncs.rint.__doc__ -def round(col: "ColumnOrName", scale: int = 0) -> Column: -return _invoke_function("round", _to_col(col), lit(scale)) +def round(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Column: +if scale is None: +return _invoke_function_over_columns("round", col) +else: +scale = lit(scale) if isinstance(scale, int) else scale +return _invoke
[spark] branch master updated: [SPARK-45090][PYTHON][CONNECT] DataFrame.{cube, rollup}` support column ordinals
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 66f89f35f0e [SPARK-45090][PYTHON][CONNECT] DataFrame.{cube, rollup}` support column ordinals 66f89f35f0e is described below commit 66f89f35f0eadbb97de3654286af4b76870a12b9 Author: Ruifeng Zheng AuthorDate: Thu Sep 7 10:04:21 2023 +0800 [SPARK-45090][PYTHON][CONNECT] DataFrame.{cube, rollup}` support column ordinals ### What changes were proposed in this pull request? `DataFrame.{cube, rollup}` support column ordinals ### Why are the changes needed? for feature parity: ``` In [10]: df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"]) In [11]: df.createOrReplaceTempView("v") In [12]: spark.sql("SELECT name, age, COUNT(1) FROM v GROUP BY CUBE(1, 2) ORDER BY 1, 2").show() +-+++ | name| age|count(1)| +-+++ | NULL|NULL| 2| | NULL| 2| 1| | NULL| 5| 1| |Alice|NULL| 1| |Alice| 2| 1| | Bob|NULL| 1| | Bob| 5| 1| +-+++ In [13]: df.select("name", "age").cube(1, 2).agg(sf.count(sf.lit(1))).orderBy(1, 2).show() +-+++ | name| age|count(1)| +-+++ | NULL|NULL| 2| | NULL| 2| 1| | NULL| 5| 1| |Alice|NULL| 1| |Alice| 2| 1| | Bob|NULL| 1| | Bob| 5| 1| +-+++ ``` ### Does this PR introduce _any_ user-facing change? yes ### How was this patch tested? added doctest ### Was this patch authored or co-authored using generative AI tooling? NO Closes #42832 from zhengruifeng/py_cube_rollup_ordinals. 
Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- python/pyspark/sql/connect/dataframe.py | 12 + python/pyspark/sql/dataframe.py | 46 ++--- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py index c443023ce02..639a60a0fb3 100644 --- a/python/pyspark/sql/connect/dataframe.py +++ b/python/pyspark/sql/connect/dataframe.py @@ -512,6 +512,12 @@ class DataFrame: _cols.append(c) elif isinstance(c, str): _cols.append(self[c]) +elif isinstance(c, int) and not isinstance(c, bool): +# TODO: should introduce dedicated error class +if c < 1: +raise IndexError(f"Column ordinal must be positive but got {c}") +# ordinal is 1-based +_cols.append(self[c - 1]) else: raise PySparkTypeError( error_class="NOT_COLUMN_OR_STR", @@ -529,6 +535,12 @@ class DataFrame: _cols.append(c) elif isinstance(c, str): _cols.append(self[c]) +elif isinstance(c, int) and not isinstance(c, bool): +# TODO: should introduce dedicated error class +if c < 1: +raise IndexError(f"Column ordinal must be positive but got {c}") +# ordinal is 1-based +_cols.append(self[c - 1]) else: raise PySparkTypeError( error_class="NOT_COLUMN_OR_STR", diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index f59ae40542b..f00c8c5ab42 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -3924,7 +3924,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): def rollup(self, __cols: Union[List[Column], List[str]]) -> "GroupedData": ... -def rollup(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc] +def rollup(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": # type: ignore[misc] """ Create a multi-dimensional rollup for the current :class:`DataFrame` using the specified columns, so we can run aggregation on them. @@ -3934,6 +3934,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): .. versionchanged:: 3.4.0 Supports Spark Connect. 
+.. versionchanged:: 4.0.0 +Supports column ordinal. + Parameters -- cols : list, str or :class:`Column` @@ -3946,6 +3949,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :class:`GroupedData` Rolled-up data by given columns. +Notes +- +A column ordinal starts from 1, which is different from the +0-based :meth:`__getitem__`. + Examples
[spark] branch branch-3.5 updated: [SPARK-44835][CONNECT] Make INVALID_CURSOR.DISCONNECTED a retriable error
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 3ceec3b9c95 [SPARK-44835][CONNECT] Make INVALID_CURSOR.DISCONNECTED a retriable error 3ceec3b9c95 is described below commit 3ceec3b9c9502ba8ed5d83b45a3e33ab814409bb Author: Juliusz Sompolski AuthorDate: Thu Sep 7 10:48:28 2023 +0900 [SPARK-44835][CONNECT] Make INVALID_CURSOR.DISCONNECTED a retriable error ### What changes were proposed in this pull request? Make INVALID_CURSOR.DISCONNECTED a retriable error. ### Why are the changes needed? This error can happen if two RPCs are racing to reattach to the query, and the client is still using the losing one. SPARK-44833 was a bug that exposed such a situation. That was fixed, but to be more robust, we can make this error retryable. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Tests will be added in https://github.com/apache/spark/pull/42560 ### Was this patch authored or co-authored using generative AI tooling? No. Closes #42818 from juliuszsompolski/SPARK-44835. 
Authored-by: Juliusz Sompolski Signed-off-by: Hyukjin Kwon (cherry picked from commit f13743de04e430e59c4eaeca464447608bd32b1d) Signed-off-by: Hyukjin Kwon --- .../sql/connect/client/GrpcRetryHandler.scala | 17 +++- python/pyspark/sql/connect/client/core.py | 31 +++--- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala index a6841e7f118..8791530607c 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala @@ -217,7 +217,22 @@ private[sql] object GrpcRetryHandler extends Logging { */ private[client] def retryException(e: Throwable): Boolean = { e match { - case e: StatusRuntimeException => e.getStatus.getCode == Status.Code.UNAVAILABLE + case e: StatusRuntimeException => +val statusCode: Status.Code = e.getStatus.getCode + +if (statusCode == Status.Code.INTERNAL) { + val msg: String = e.toString + + // This error happens if another RPC preempts this RPC. + if (msg.contains("INVALID_CURSOR.DISCONNECTED")) { +return true + } +} + +if (statusCode == Status.Code.UNAVAILABLE) { + return true +} +false case _ => false } } diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py index 4b8a2348adc..7b3299d123b 100644 --- a/python/pyspark/sql/connect/client/core.py +++ b/python/pyspark/sql/connect/client/core.py @@ -585,11 +585,36 @@ class SparkConnectClient(object): @classmethod def retry_exception(cls, e: Exception) -> bool: -if isinstance(e, grpc.RpcError): -return e.code() == grpc.StatusCode.UNAVAILABLE -else: +""" +Helper function that is used to identify if an exception thrown by the server +can be retried or not. 
+ +Parameters +-- +e : Exception +The GRPC error as received from the server. Typed as Exception, because other exception +thrown during client processing can be passed here as well. + +Returns +--- +True if the exception can be retried, False otherwise. + +""" +if not isinstance(e, grpc.RpcError): return False +if e.code() in [grpc.StatusCode.INTERNAL]: +msg = str(e) + +# This error happens if another RPC preempts this RPC. +if "INVALID_CURSOR.DISCONNECTED" in msg: +return True + +if e.code() == grpc.StatusCode.UNAVAILABLE: +return True + +return False + def __init__( self, connection: Union[str, ChannelBuilder], - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-44835][CONNECT] Make INVALID_CURSOR.DISCONNECTED a retriable error
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new f13743de04e [SPARK-44835][CONNECT] Make INVALID_CURSOR.DISCONNECTED a retriable error f13743de04e is described below commit f13743de04e430e59c4eaeca464447608bd32b1d Author: Juliusz Sompolski AuthorDate: Thu Sep 7 10:48:28 2023 +0900 [SPARK-44835][CONNECT] Make INVALID_CURSOR.DISCONNECTED a retriable error ### What changes were proposed in this pull request? Make INVALID_CURSOR.DISCONNECTED a retriable error. ### Why are the changes needed? This error can happen if two RPCs are racing to reattach to the query, and the client is still using the losing one. SPARK-44833 was a bug that exposed such a situation. That was fixed, but to be more robust, we can make this error retryable. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Tests will be added in https://github.com/apache/spark/pull/42560 ### Was this patch authored or co-authored using generative AI tooling? No. Closes #42818 from juliuszsompolski/SPARK-44835. 
Authored-by: Juliusz Sompolski Signed-off-by: Hyukjin Kwon --- .../sql/connect/client/GrpcRetryHandler.scala | 17 +++- python/pyspark/sql/connect/client/core.py | 31 +++--- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala index a6841e7f118..8791530607c 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/GrpcRetryHandler.scala @@ -217,7 +217,22 @@ private[sql] object GrpcRetryHandler extends Logging { */ private[client] def retryException(e: Throwable): Boolean = { e match { - case e: StatusRuntimeException => e.getStatus.getCode == Status.Code.UNAVAILABLE + case e: StatusRuntimeException => +val statusCode: Status.Code = e.getStatus.getCode + +if (statusCode == Status.Code.INTERNAL) { + val msg: String = e.toString + + // This error happens if another RPC preempts this RPC. + if (msg.contains("INVALID_CURSOR.DISCONNECTED")) { +return true + } +} + +if (statusCode == Status.Code.UNAVAILABLE) { + return true +} +false case _ => false } } diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py index 1e439b8c0f6..e8d598bd0fe 100644 --- a/python/pyspark/sql/connect/client/core.py +++ b/python/pyspark/sql/connect/client/core.py @@ -585,11 +585,36 @@ class SparkConnectClient(object): @classmethod def retry_exception(cls, e: Exception) -> bool: -if isinstance(e, grpc.RpcError): -return e.code() == grpc.StatusCode.UNAVAILABLE -else: +""" +Helper function that is used to identify if an exception thrown by the server +can be retried or not. + +Parameters +-- +e : Exception +The GRPC error as received from the server. 
Typed as Exception, because other exception +thrown during client processing can be passed here as well. + +Returns +--- +True if the exception can be retried, False otherwise. + +""" +if not isinstance(e, grpc.RpcError): return False +if e.code() in [grpc.StatusCode.INTERNAL]: +msg = str(e) + +# This error happens if another RPC preempts this RPC. +if "INVALID_CURSOR.DISCONNECTED" in msg: +return True + +if e.code() == grpc.StatusCode.UNAVAILABLE: +return True + +return False + def __init__( self, connection: Union[str, ChannelBuilder], - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.5 updated: [SPARK-44640][PYTHON][FOLLOW-UP][3.5] Update UDTF error messages to include method name
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new a9d601cf357 [SPARK-44640][PYTHON][FOLLOW-UP][3.5] Update UDTF error messages to include method name a9d601cf357 is described below commit a9d601cf35706c61e30ef1f1daae34a51e6bb3b0 Author: allisonwang-db AuthorDate: Thu Sep 7 10:42:20 2023 +0900 [SPARK-44640][PYTHON][FOLLOW-UP][3.5] Update UDTF error messages to include method name (cherry picked from commit 3e22c8653d728a6b8523051faddcca437accfc22) ### What changes were proposed in this pull request? This PR is a follow-up for SPARK-44640 to make the error message of a few UDTF errors more informative by including the method name in the error message (`eval` or `terminate`). ### Why are the changes needed? To improve error messages. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing tests. ### Was this patch authored or co-authored using generative AI tooling? No Closes #42840 from allisonwang-db/spark-44640-follow-up-3.5. Authored-by: allisonwang-db Signed-off-by: Hyukjin Kwon --- python/pyspark/errors/error_classes.py | 8 python/pyspark/sql/tests/test_udtf.py | 21 +++ python/pyspark/worker.py | 37 +- 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py index 4709f01ba06..0fbe489f623 100644 --- a/python/pyspark/errors/error_classes.py +++ b/python/pyspark/errors/error_classes.py @@ -244,7 +244,7 @@ ERROR_CLASSES_JSON = """ }, "INVALID_ARROW_UDTF_RETURN_TYPE" : { "message" : [ - "The return type of the arrow-optimized Python UDTF should be of type 'pandas.DataFrame', but the function returned a value of type with value: ." 
+ "The return type of the arrow-optimized Python UDTF should be of type 'pandas.DataFrame', but the '' method returned a value of type with value: ." ] }, "INVALID_BROADCAST_OPERATION": { @@ -730,17 +730,17 @@ ERROR_CLASSES_JSON = """ }, "UDTF_INVALID_OUTPUT_ROW_TYPE" : { "message" : [ -"The type of an individual output row in the UDTF is invalid. Each row should be a tuple, list, or dict, but got ''. Please make sure that the output rows are of the correct type." +"The type of an individual output row in the '' method of the UDTF is invalid. Each row should be a tuple, list, or dict, but got ''. Please make sure that the output rows are of the correct type." ] }, "UDTF_RETURN_NOT_ITERABLE" : { "message" : [ - "The return value of the UDTF is invalid. It should be an iterable (e.g., generator or list), but got ''. Please make sure that the UDTF returns one of these types." + "The return value of the '' method of the UDTF is invalid. It should be an iterable (e.g., generator or list), but got ''. Please make sure that the UDTF returns one of these types." ] }, "UDTF_RETURN_SCHEMA_MISMATCH" : { "message" : [ - "The number of columns in the result does not match the specified schema. Expected column count: , Actual column count: . Please make sure the values returned by the function have the same number of columns as specified in the output schema." + "The number of columns in the result does not match the specified schema. Expected column count: , Actual column count: . Please make sure the values returned by the '' method have the same number of columns as specified in the output schema." 
] }, "UDTF_RETURN_TYPE_MISMATCH" : { diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py index 1ff9e55dd78..944ce6d85b8 100644 --- a/python/pyspark/sql/tests/test_udtf.py +++ b/python/pyspark/sql/tests/test_udtf.py @@ -164,6 +164,27 @@ class BaseUDTFTestsMixin: with self.assertRaisesRegex(PythonException, "UDTF_RETURN_NOT_ITERABLE"): TestUDTF(lit(1)).collect() +def test_udtf_with_zero_arg_and_invalid_return_value(self): +@udtf(returnType="x: int") +class TestUDTF: +def eval(self): +return 1 + +with self.assertRaisesRegex(PythonException, "UDTF_RETURN_NOT_ITERABLE"): +TestUDTF().collect() + +def test_udtf_with_invalid_return_value_in_terminate(self): +@udtf(returnType="x: int") +class TestUDTF: +def eval(self, a): +... + +def terminate(self): +return 1 + +with self.assertRaisesRegex(PythonException, "UDTF_RETURN_NOT_ITERABLE"): +TestUDTF(lit(1)).collect() + def test_udtf_eval_with_no_ret
[spark] branch master updated (e82805da876 -> a0a78b6081f)
This is an automated email from the ASF dual-hosted git repository. kabhwan pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from e82805da876 [SPARK-45050][SQL][CONNECT] Improve error message for UNKNOWN io.grpc.StatusRuntimeException add a0a78b6081f [SPARK-45025][CORE] Allow block manager memory store iterator to handle thread interrupt and perform task completion gracefully No new revisions were added by this update. Summary of changes: .../apache/spark/storage/memory/MemoryStore.scala | 21 +++-- 1 file changed, 15 insertions(+), 6 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.5 updated: [SPARK-45050][SQL][CONNECT] Improve error message for UNKNOWN io.grpc.StatusRuntimeException
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 916b6f51af2 [SPARK-45050][SQL][CONNECT] Improve error message for UNKNOWN io.grpc.StatusRuntimeException 916b6f51af2 is described below commit 916b6f51af213da1fa608231ffdad97004fcbd74 Author: Yihong He AuthorDate: Thu Sep 7 09:02:38 2023 +0900 [SPARK-45050][SQL][CONNECT] Improve error message for UNKNOWN io.grpc.StatusRuntimeException ### What changes were proposed in this pull request? - Improve error message for UNKNOWN io.grpc.StatusRuntimeException Before: ``` [info] - handle unknown exception *** FAILED *** (15 milliseconds) [info] org.apache.spark.SparkException: [info] at org.apache.spark.sql.connect.client.GrpcExceptionConverter$.toThrowable(GrpcExceptionConverter.scala:110) [info] at org.apache.spark.sql.connect.client.GrpcExceptionConverter$.convert(GrpcExceptionConverter.scala:41) [info] at org.apache.spark.sql.connect.client.GrpcExceptionConverter$$anon$1.hasNext(GrpcExceptionConverter.scala:49) [info] at org.apache.spark.sql.connect.client.SparkResult.org$apache$spark$sql$connect$client$SparkResult$$processResponses(SparkResult.scala:83) [info] at org.apache.spark.sql.connect.client.SparkResult.length(SparkResult.scala:153) [info] at org.apache.spark.sql.connect.client.SparkResult.toArray(SparkResult.scala:183) [info] at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:2910) [info] at org.apache.spark.sql.Dataset.withResult(Dataset.scala:3350) [info] at org.apache.spark.sql.Dataset.collect(Dataset.scala:2909) [info] at org.apache.spark.sql.ClientE2ETestSuite.$anonfun$new$19(ClientE2ETestSuite.scala:118) ``` After: ``` [info] - handle unknown exception *** FAILED *** (21 milliseconds) [info] org.apache.spark.SparkException: io.grpc.StatusRuntimeException: UNKNOWN [info] at 
org.apache.spark.sql.connect.client.GrpcExceptionConverter$.toThrowable(GrpcExceptionConverter.scala:110) [info] at org.apache.spark.sql.connect.client.GrpcExceptionConverter$.convert(GrpcExceptionConverter.scala:41) [info] at org.apache.spark.sql.connect.client.GrpcExceptionConverter$$anon$1.hasNext(GrpcExceptionConverter.scala:49) [info] at org.apache.spark.sql.connect.client.SparkResult.org$apache$spark$sql$connect$client$SparkResult$$processResponses(SparkResult.scala:83) [info] at org.apache.spark.sql.connect.client.SparkResult.length(SparkResult.scala:153) [info] at org.apache.spark.sql.connect.client.SparkResult.toArray(SparkResult.scala:183) [info] at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:2910) [info] at org.apache.spark.sql.Dataset.withResult(Dataset.scala:3350) [info] at org.apache.spark.sql.Dataset.collect(Dataset.scala:2909) [info] at org.apache.spark.sql.ClientE2ETestSuite.$anonfun$new$19(ClientE2ETestSuite.scala:118) ``` ### Why are the changes needed? - Better readability of the exception message ### Does this PR introduce _any_ user-facing change? - No ### How was this patch tested? - build/sbt "connect-client-jvm/testOnly *ClientE2ETestSuite" ### Was this patch authored or co-authored using generative AI tooling? Closes #42771 from heyihong/SPARK-45050. 
Authored-by: Yihong He Signed-off-by: Hyukjin Kwon (cherry picked from commit e82805da876672f7e9447ec54e66175f84ea3d36) Signed-off-by: Hyukjin Kwon --- .../test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala | 12 .../spark/sql/connect/client/GrpcExceptionConverter.scala| 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala index fd443b73925..df36b53791a 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala @@ -109,6 +109,18 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM assert(df.collect().length == 501) } + test("handle unknown exception") { +var df = spark.range(1) +val limit = spark.conf.get("spark.connect.grpc.marshallerRecursionLimit").toInt + 1 +for (a <- 1 to limit) { + df = df.union(spark.range(a, a + 1)) +} +val ex = intercept[SparkException] { + df.collect() +} +assert(ex.getMessage.contains("io.grpc.StatusRuntimeException: UNKNOWN")) + } + test("many tables") { withSQLConf("spark.sql.execution
[spark] branch master updated: [SPARK-45050][SQL][CONNECT] Improve error message for UNKNOWN io.grpc.StatusRuntimeException
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new e82805da876 [SPARK-45050][SQL][CONNECT] Improve error message for UNKNOWN io.grpc.StatusRuntimeException e82805da876 is described below commit e82805da876672f7e9447ec54e66175f84ea3d36 Author: Yihong He AuthorDate: Thu Sep 7 09:02:38 2023 +0900 [SPARK-45050][SQL][CONNECT] Improve error message for UNKNOWN io.grpc.StatusRuntimeException ### What changes were proposed in this pull request? - Improve error message for UNKNOWN io.grpc.StatusRuntimeException Before: ``` [info] - handle unknown exception *** FAILED *** (15 milliseconds) [info] org.apache.spark.SparkException: [info] at org.apache.spark.sql.connect.client.GrpcExceptionConverter$.toThrowable(GrpcExceptionConverter.scala:110) [info] at org.apache.spark.sql.connect.client.GrpcExceptionConverter$.convert(GrpcExceptionConverter.scala:41) [info] at org.apache.spark.sql.connect.client.GrpcExceptionConverter$$anon$1.hasNext(GrpcExceptionConverter.scala:49) [info] at org.apache.spark.sql.connect.client.SparkResult.org$apache$spark$sql$connect$client$SparkResult$$processResponses(SparkResult.scala:83) [info] at org.apache.spark.sql.connect.client.SparkResult.length(SparkResult.scala:153) [info] at org.apache.spark.sql.connect.client.SparkResult.toArray(SparkResult.scala:183) [info] at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:2910) [info] at org.apache.spark.sql.Dataset.withResult(Dataset.scala:3350) [info] at org.apache.spark.sql.Dataset.collect(Dataset.scala:2909) [info] at org.apache.spark.sql.ClientE2ETestSuite.$anonfun$new$19(ClientE2ETestSuite.scala:118) ``` After: ``` [info] - handle unknown exception *** FAILED *** (21 milliseconds) [info] org.apache.spark.SparkException: io.grpc.StatusRuntimeException: UNKNOWN [info] at 
org.apache.spark.sql.connect.client.GrpcExceptionConverter$.toThrowable(GrpcExceptionConverter.scala:110) [info] at org.apache.spark.sql.connect.client.GrpcExceptionConverter$.convert(GrpcExceptionConverter.scala:41) [info] at org.apache.spark.sql.connect.client.GrpcExceptionConverter$$anon$1.hasNext(GrpcExceptionConverter.scala:49) [info] at org.apache.spark.sql.connect.client.SparkResult.org$apache$spark$sql$connect$client$SparkResult$$processResponses(SparkResult.scala:83) [info] at org.apache.spark.sql.connect.client.SparkResult.length(SparkResult.scala:153) [info] at org.apache.spark.sql.connect.client.SparkResult.toArray(SparkResult.scala:183) [info] at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:2910) [info] at org.apache.spark.sql.Dataset.withResult(Dataset.scala:3350) [info] at org.apache.spark.sql.Dataset.collect(Dataset.scala:2909) [info] at org.apache.spark.sql.ClientE2ETestSuite.$anonfun$new$19(ClientE2ETestSuite.scala:118) ``` ### Why are the changes needed? - Better readability of the exception message ### Does this PR introduce _any_ user-facing change? - No ### How was this patch tested? - build/sbt "connect-client-jvm/testOnly *ClientE2ETestSuite" ### Was this patch authored or co-authored using generative AI tooling? Closes #42771 from heyihong/SPARK-45050. 
Authored-by: Yihong He Signed-off-by: Hyukjin Kwon --- .../test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala | 12 .../spark/sql/connect/client/GrpcExceptionConverter.scala| 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala index f10f5c78ead..06338f33e1d 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala @@ -109,6 +109,18 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM assert(df.collect().length == 501) } + test("handle unknown exception") { +var df = spark.range(1) +val limit = spark.conf.get("spark.connect.grpc.marshallerRecursionLimit").toInt + 1 +for (a <- 1 to limit) { + df = df.union(spark.range(a, a + 1)) +} +val ex = intercept[SparkException] { + df.collect() +} +assert(ex.getMessage.contains("io.grpc.StatusRuntimeException: UNKNOWN")) + } + test("many tables") { withSQLConf("spark.sql.execution.arrow.maxRecordsPerBatch" -> "10") { val numTables = 20 diff --git a/connector/connect/common/src/main/sc
[spark] branch branch-3.3 updated: [SPARK-45079][SQL][3.3] Fix an internal error from `percentile_approx()` on `NULL` accuracy
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.3 by this push: new 5250ed65cf2 [SPARK-45079][SQL][3.3] Fix an internal error from `percentile_approx()` on `NULL` accuracy 5250ed65cf2 is described below commit 5250ed65cf2c70e4b456c96c1006b854f56ef1f2 Author: Max Gekk AuthorDate: Wed Sep 6 18:56:14 2023 +0300 [SPARK-45079][SQL][3.3] Fix an internal error from `percentile_approx()` on `NULL` accuracy ### What changes were proposed in this pull request? In the PR, I propose to check the `accuracy` argument is not a NULL in `ApproximatePercentile`. And if it is, throw an `AnalysisException` with new error class `DATATYPE_MISMATCH.UNEXPECTED_NULL`. This is a backport of https://github.com/apache/spark/pull/42817. ### Why are the changes needed? To fix the issue demonstrated by the example: ```sql $ spark-sql (default)> SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) FROM VALUES (0), (1), (2), (10) AS tab(col); [INTERNAL_ERROR] The Spark SQL phase analysis failed with an internal error. You hit a bug in Spark or the Spark plugins you use. Please, report this bug to the corresponding communities or vendors, and provide the full stack trace. ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By running new test: ``` $ build/sbt "test:testOnly *.ApproximatePercentileQuerySuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Authored-by: Max Gekk (cherry picked from commit 24b29adcf53616067a9fa2ca201e3f4d2f54436b) Closes #42835 from MaxGekk/fix-internal-error-in-percentile_approx-3.3. 
Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../expressions/aggregate/ApproximatePercentile.scala | 5 - .../spark/sql/ApproximatePercentileQuerySuite.scala | 19 +++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index d8eccc075a2..b816e4a9719 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -95,7 +95,8 @@ case class ApproximatePercentile( } // Mark as lazy so that accuracyExpression is not evaluated during tree transformation. - private lazy val accuracy: Long = accuracyExpression.eval().asInstanceOf[Number].longValue + private lazy val accuracyNum = accuracyExpression.eval().asInstanceOf[Number] + private lazy val accuracy: Long = accuracyNum.longValue override def inputTypes: Seq[AbstractDataType] = { // Support NumericType, DateType, TimestampType and TimestampNTZType since their internal types @@ -120,6 +121,8 @@ case class ApproximatePercentile( defaultCheck } else if (!percentageExpression.foldable || !accuracyExpression.foldable) { TypeCheckFailure(s"The accuracy or percentage provided must be a constant literal") +} else if (accuracyNum == null) { + TypeCheckFailure("Accuracy value must not be null") } else if (accuracy <= 0 || accuracy > Int.MaxValue) { TypeCheckFailure(s"The accuracy provided must be a literal between (0, ${Int.MaxValue}]" + s" (current value = $accuracy)") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala index 9237c9e9486..3fd1592a107 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala @@ -337,4 +337,23 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSparkSession Row(Period.ofMonths(200).normalized(), null, Duration.ofSeconds(200L))) } } + + test("SPARK-45079: NULL arguments of percentile_approx") { +val e1 = intercept[AnalysisException] { + sql( +""" + |SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) + |FROM VALUES (0), (1), (2), (10) AS tab(col); + |""".stripMargin).collect() +} +assert(e1.getMessage.contains("Accuracy value must not be null")) +val e2 = intercept[AnalysisException] { + sql( +""" + |SELECT percentile_approx(col, NULL, 100) + |FROM VALUES (0), (1), (2), (10) AS tab(col); + |""".stripMargin).collect() +} +assert(e2.getMessag
[spark] branch master updated: [BUILD][TEST] Remove obsolete repo of DB2 JDBC driver
This is an automated email from the ASF dual-hosted git repository. yao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 0777562c519 [BUILD][TEST] Remove obsolete repo of DB2 JDBC driver 0777562c519 is described below commit 0777562c5197bde95a514102b1e2d0a04f4ab062 Author: Cheng Pan AuthorDate: Wed Sep 6 20:16:19 2023 +0800 [BUILD][TEST] Remove obsolete repo of DB2 JDBC driver ### What changes were proposed in this pull request? Remove obsolete repo https://app.camunda.com/nexus/content/repositories/public/ ### Why are the changes needed? The repo was introduced in SPARK-10521, because the DB2 JDBC driver jars did not exist in Maven Central on that day. Now, the https://app.camunda.com/nexus/content/repositories/public/ sunset and https://repo1.maven.org/maven2/com/ibm/db2/jcc/11.5.8.0/jcc-11.5.8.0.jar available in Maven Central. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass GA. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #42820 from pan3793/db2-repo. 
Authored-by: Cheng Pan Signed-off-by: Kent Yao --- connector/docker-integration-tests/pom.xml | 28 project/SparkBuild.scala | 3 +-- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/connector/docker-integration-tests/pom.xml b/connector/docker-integration-tests/pom.xml index 2b21042fb0d..a9c066ede2d 100644 --- a/connector/docker-integration-tests/pom.xml +++ b/connector/docker-integration-tests/pom.xml @@ -34,17 +34,6 @@ docker-integration-tests - - - db - https://app.camunda.com/nexus/content/repositories/public/ - -true -warn - - - - com.spotify @@ -135,23 +124,6 @@ ojdbc8 test - - com.ibm.db2 jcc diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 2f437eeb75c..95eeb3d8d47 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -996,8 +996,7 @@ object Unsafe { object DockerIntegrationTests { // This serves to override the override specified in DependencyOverrides: lazy val settings = Seq( -dependencyOverrides += "com.google.guava" % "guava" % "18.0", -resolvers += "DB2" at "https://app.camunda.com/nexus/content/repositories/public/"; +dependencyOverrides += "com.google.guava" % "guava" % "18.0" ) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-44801][SQL][FOLLOWUP] Remove overdue comments for generating plan info in SQLExecution
This is an automated email from the ASF dual-hosted git repository. yao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new cbe36b20a85 [SPARK-44801][SQL][FOLLOWUP] Remove overdue comments for generating plan info in SQLExecution cbe36b20a85 is described below commit cbe36b20a85913a37b7f569c3ce7a586eb9cbc89 Author: Kent Yao AuthorDate: Wed Sep 6 18:24:12 2023 +0800 [SPARK-44801][SQL][FOLLOWUP] Remove overdue comments for generating plan info in SQLExecution ### What changes were proposed in this pull request? This is a followup to clean up comment for generating plan info in SQLExecution, which is currently wrapped with try-catch block. ### Why are the changes needed? overdue and misleading comments cleanup ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? existing ### Was this patch authored or co-authored using generative AI tooling? no Closes #42825 from yaooqinn/SPARK-44801-F. Authored-by: Kent Yao Signed-off-by: Kent Yao --- .../src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala| 2 -- 1 file changed, 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index cd949fb21c9..2a44a016d2d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -134,8 +134,6 @@ object SQLExecution extends Logging { description = desc, details = callSite.longForm, physicalPlanDescription = queryExecution.explainString(planDescriptionMode), -// `queryExecution.executedPlan` triggers query planning. 
If it fails, the exception -// will be caught and reported in the `SparkListenerSQLExecutionEnd` sparkPlanInfo = planInfo, time = System.currentTimeMillis(), modifiedConfigs = redactedConfigs, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.5 updated: [SPARK-44833][CONNECT][PYTHON][FOLLOW-UP] Fix the type annotation for iterator at reattach.py
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new c98ade32386 [SPARK-44833][CONNECT][PYTHON][FOLLOW-UP] Fix the type annotation for iterator at reattach.py c98ade32386 is described below commit c98ade32386ff396c706cab7ece4d287e4b58453 Author: Hyukjin Kwon AuthorDate: Wed Sep 6 16:48:54 2023 +0900 [SPARK-44833][CONNECT][PYTHON][FOLLOW-UP] Fix the type annotation for iterator at reattach.py ### What changes were proposed in this pull request? This PR fixes the type hint of `self._iterator` at `reattach.py` to be `Optional` because `None` is assigned here. ### Why are the changes needed? To fix the CI: ``` starting mypy annotations test... annotations failed mypy checks: python/pyspark/sql/connect/client/reattach.py:149: error: Incompatible types in assignment (expression has type "None", variable has type "Iterator[ExecutePlanResponse]") [assignment] python/pyspark/sql/connect/client/reattach.py:254: error: Incompatible types in assignment (expression has type "None", variable has type "Iterator[ExecutePlanResponse]") [assignment] python/pyspark/sql/connect/client/reattach.py:[25](https://github.com/apache/spark/actions/runs/6093065151/job/16532169212#step:19:26)8: error: Incompatible types in assignment (expression has type "None", variable has type "Iterator[ExecutePlanResponse]") [assignment] Found 3 errors in 1 file (checked 703 source files) ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manually ran the script of `./dev/lint-python` ### Was this patch authored or co-authored using generative AI tooling? No Closes #42830 from HyukjinKwon/SPARK-44833-followup. 
Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon (cherry picked from commit 48872d4b3938fd82c4d753549787f6e7c62cce7e) Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/connect/client/reattach.py | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/connect/client/reattach.py b/python/pyspark/sql/connect/client/reattach.py index d3765fb6696..7e1e722d5fd 100644 --- a/python/pyspark/sql/connect/client/reattach.py +++ b/python/pyspark/sql/connect/client/reattach.py @@ -94,7 +94,7 @@ class ExecutePlanResponseReattachableIterator(Generator): # Note: This is not retried, because no error would ever be thrown here, and GRPC will only # throw error on first self._has_next(). self._metadata = metadata -self._iterator: Iterator[pb2.ExecutePlanResponse] = iter( +self._iterator: Optional[Iterator[pb2.ExecutePlanResponse]] = iter( self._stub.ExecutePlan(self._initial_request, metadata=metadata) ) @@ -133,7 +133,9 @@ class ExecutePlanResponseReattachableIterator(Generator): with attempt: if self._current is None: try: -self._current = self._call_iter(lambda: next(self._iterator)) +self._current = self._call_iter( +lambda: next(self._iterator) # type: ignore[arg-type] +) except StopIteration: pass @@ -150,7 +152,9 @@ class ExecutePlanResponseReattachableIterator(Generator): # shouldn't change assert not self._result_complete try: -self._current = self._call_iter(lambda: next(self._iterator)) +self._current = self._call_iter( +lambda: next(self._iterator) # type: ignore[arg-type] +) except StopIteration: pass has_next = self._current is not None - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (24b29adcf53 -> 48872d4b393)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git from 24b29adcf53 [SPARK-45079][SQL] Fix an internal error from `percentile_approx()` on `NULL` accuracy add 48872d4b393 [SPARK-44833][CONNECT][PYTHON][FOLLOW-UP] Fix the type annotation for iterator at reattach.py No new revisions were added by this update. Summary of changes: python/pyspark/sql/connect/client/reattach.py | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.4 updated: [SPARK-45079][SQL] Fix an internal error from `percentile_approx()` on `NULL` accuracy
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.4 by this push: new f0b421553bc [SPARK-45079][SQL] Fix an internal error from `percentile_approx()`on `NULL` accuracy f0b421553bc is described below commit f0b421553bc1850cc3e8ed5d564da8f6425cd244 Author: Max Gekk AuthorDate: Wed Sep 6 10:32:37 2023 +0300 [SPARK-45079][SQL] Fix an internal error from `percentile_approx()`on `NULL` accuracy ### What changes were proposed in this pull request? In the PR, I propose to check the `accuracy` argument is not a NULL in `ApproximatePercentile`. And if it is, throw an `AnalysisException` with new error class `DATATYPE_MISMATCH.UNEXPECTED_NULL`. ### Why are the changes needed? To fix the issue demonstrated by the example: ```sql $ spark-sql (default)> SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) FROM VALUES (0), (1), (2), (10) AS tab(col); [INTERNAL_ERROR] The Spark SQL phase analysis failed with an internal error. You hit a bug in Spark or the Spark plugins you use. Please, report this bug to the corresponding communities or vendors, and provide the full stack trace. ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By running new test: ``` $ build/sbt "test:testOnly *.ApproximatePercentileQuerySuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #42817 from MaxGekk/fix-internal-error-in-percentile_approx. 
Authored-by: Max Gekk Signed-off-by: Max Gekk (cherry picked from commit 24b29adcf53616067a9fa2ca201e3f4d2f54436b) Signed-off-by: Max Gekk --- .../aggregate/ApproximatePercentile.scala | 7 - .../sql/ApproximatePercentileQuerySuite.scala | 31 ++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 1499f358ac4..ebf1085c0c1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -96,7 +96,8 @@ case class ApproximatePercentile( } // Mark as lazy so that accuracyExpression is not evaluated during tree transformation. - private lazy val accuracy: Long = accuracyExpression.eval().asInstanceOf[Number].longValue + private lazy val accuracyNum = accuracyExpression.eval().asInstanceOf[Number] + private lazy val accuracy: Long = accuracyNum.longValue override def inputTypes: Seq[AbstractDataType] = { // Support NumericType, DateType, TimestampType and TimestampNTZType since their internal types @@ -137,6 +138,10 @@ case class ApproximatePercentile( "inputExpr" -> toSQLExpr(accuracyExpression) ) ) +} else if (accuracyNum == null) { + DataTypeMismatch( +errorSubClass = "UNEXPECTED_NULL", +messageParameters = Map("exprName" -> "accuracy")) } else if (accuracy <= 0 || accuracy > Int.MaxValue) { DataTypeMismatch( errorSubClass = "VALUE_OUT_OF_RANGE", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala index 9237c9e9486..8598e92f029 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala @@ -337,4 +337,35 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSparkSession Row(Period.ofMonths(200).normalized(), null, Duration.ofSeconds(200L))) } } + + test("SPARK-45079: NULL arguments of percentile_approx") { +checkError( + exception = intercept[AnalysisException] { +sql( + """ +|SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) +|FROM VALUES (0), (1), (2), (10) AS tab(col); +|""".stripMargin).collect() + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_NULL", + parameters = Map( +"exprName" -> "accuracy", +"sqlExpr" -> "\"percentile_approx(col, array(0.5, 0.4, 0.1), NULL)\""), + context = ExpectedContext( +"", "", 8, 57, "percentile_approx(col, array(0.5, 0.4, 0.1), NULL)")) +checkError( + exception = intercept[AnalysisException] { +sql( + """ +|SELECT percentile_approx(col, NULL, 100) +|FROM VALUES (0), (1), (2
[spark] branch branch-3.5 updated: [SPARK-45079][SQL] Fix an internal error from `percentile_approx()` on `NULL` accuracy
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 9b750e93035 [SPARK-45079][SQL] Fix an internal error from `percentile_approx()`on `NULL` accuracy 9b750e93035 is described below commit 9b750e930357eae092420f09ca9366e49dc589e2 Author: Max Gekk AuthorDate: Wed Sep 6 10:32:37 2023 +0300 [SPARK-45079][SQL] Fix an internal error from `percentile_approx()`on `NULL` accuracy ### What changes were proposed in this pull request? In the PR, I propose to check the `accuracy` argument is not a NULL in `ApproximatePercentile`. And if it is, throw an `AnalysisException` with new error class `DATATYPE_MISMATCH.UNEXPECTED_NULL`. ### Why are the changes needed? To fix the issue demonstrated by the example: ```sql $ spark-sql (default)> SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) FROM VALUES (0), (1), (2), (10) AS tab(col); [INTERNAL_ERROR] The Spark SQL phase analysis failed with an internal error. You hit a bug in Spark or the Spark plugins you use. Please, report this bug to the corresponding communities or vendors, and provide the full stack trace. ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By running new test: ``` $ build/sbt "test:testOnly *.ApproximatePercentileQuerySuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #42817 from MaxGekk/fix-internal-error-in-percentile_approx. 
Authored-by: Max Gekk Signed-off-by: Max Gekk (cherry picked from commit 24b29adcf53616067a9fa2ca201e3f4d2f54436b) Signed-off-by: Max Gekk --- .../aggregate/ApproximatePercentile.scala | 7 - .../sql/ApproximatePercentileQuerySuite.scala | 31 ++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 3c3afc1c7e7..5b44c3fa31b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -97,7 +97,8 @@ case class ApproximatePercentile( } // Mark as lazy so that accuracyExpression is not evaluated during tree transformation. - private lazy val accuracy: Long = accuracyExpression.eval().asInstanceOf[Number].longValue + private lazy val accuracyNum = accuracyExpression.eval().asInstanceOf[Number] + private lazy val accuracy: Long = accuracyNum.longValue override def inputTypes: Seq[AbstractDataType] = { // Support NumericType, DateType, TimestampType and TimestampNTZType since their internal types @@ -138,6 +139,10 @@ case class ApproximatePercentile( "inputExpr" -> toSQLExpr(accuracyExpression) ) ) +} else if (accuracyNum == null) { + DataTypeMismatch( +errorSubClass = "UNEXPECTED_NULL", +messageParameters = Map("exprName" -> "accuracy")) } else if (accuracy <= 0 || accuracy > Int.MaxValue) { DataTypeMismatch( errorSubClass = "VALUE_OUT_OF_RANGE", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala index 18e8dd6249b..273e8e08fd7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala @@ -339,4 +339,35 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSparkSession Row(Period.ofMonths(200).normalized(), null, Duration.ofSeconds(200L))) } } + + test("SPARK-45079: NULL arguments of percentile_approx") { +checkError( + exception = intercept[AnalysisException] { +sql( + """ +|SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) +|FROM VALUES (0), (1), (2), (10) AS tab(col); +|""".stripMargin).collect() + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_NULL", + parameters = Map( +"exprName" -> "accuracy", +"sqlExpr" -> "\"percentile_approx(col, array(0.5, 0.4, 0.1), NULL)\""), + context = ExpectedContext( +"", "", 8, 57, "percentile_approx(col, array(0.5, 0.4, 0.1), NULL)")) +checkError( + exception = intercept[AnalysisException] { +sql( + """ +|SELECT percentile_approx(col, NULL, 100) +|FROM VALUES (0), (1), (2
[spark] branch master updated: [SPARK-45079][SQL] Fix an internal error from `percentile_approx()` on `NULL` accuracy
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 24b29adcf53 [SPARK-45079][SQL] Fix an internal error from `percentile_approx()`on `NULL` accuracy 24b29adcf53 is described below commit 24b29adcf53616067a9fa2ca201e3f4d2f54436b Author: Max Gekk AuthorDate: Wed Sep 6 10:32:37 2023 +0300 [SPARK-45079][SQL] Fix an internal error from `percentile_approx()`on `NULL` accuracy ### What changes were proposed in this pull request? In the PR, I propose to check the `accuracy` argument is not a NULL in `ApproximatePercentile`. And if it is, throw an `AnalysisException` with new error class `DATATYPE_MISMATCH.UNEXPECTED_NULL`. ### Why are the changes needed? To fix the issue demonstrated by the example: ```sql $ spark-sql (default)> SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) FROM VALUES (0), (1), (2), (10) AS tab(col); [INTERNAL_ERROR] The Spark SQL phase analysis failed with an internal error. You hit a bug in Spark or the Spark plugins you use. Please, report this bug to the corresponding communities or vendors, and provide the full stack trace. ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By running new test: ``` $ build/sbt "test:testOnly *.ApproximatePercentileQuerySuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #42817 from MaxGekk/fix-internal-error-in-percentile_approx. 
Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../aggregate/ApproximatePercentile.scala | 7 - .../sql/ApproximatePercentileQuerySuite.scala | 31 ++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 3c3afc1c7e7..5b44c3fa31b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -97,7 +97,8 @@ case class ApproximatePercentile( } // Mark as lazy so that accuracyExpression is not evaluated during tree transformation. - private lazy val accuracy: Long = accuracyExpression.eval().asInstanceOf[Number].longValue + private lazy val accuracyNum = accuracyExpression.eval().asInstanceOf[Number] + private lazy val accuracy: Long = accuracyNum.longValue override def inputTypes: Seq[AbstractDataType] = { // Support NumericType, DateType, TimestampType and TimestampNTZType since their internal types @@ -138,6 +139,10 @@ case class ApproximatePercentile( "inputExpr" -> toSQLExpr(accuracyExpression) ) ) +} else if (accuracyNum == null) { + DataTypeMismatch( +errorSubClass = "UNEXPECTED_NULL", +messageParameters = Map("exprName" -> "accuracy")) } else if (accuracy <= 0 || accuracy > Int.MaxValue) { DataTypeMismatch( errorSubClass = "VALUE_OUT_OF_RANGE", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala index 18e8dd6249b..273e8e08fd7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala @@ -339,4 +339,35 @@ class 
ApproximatePercentileQuerySuite extends QueryTest with SharedSparkSession Row(Period.ofMonths(200).normalized(), null, Duration.ofSeconds(200L))) } } + + test("SPARK-45079: NULL arguments of percentile_approx") { +checkError( + exception = intercept[AnalysisException] { +sql( + """ +|SELECT percentile_approx(col, array(0.5, 0.4, 0.1), NULL) +|FROM VALUES (0), (1), (2), (10) AS tab(col); +|""".stripMargin).collect() + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_NULL", + parameters = Map( +"exprName" -> "accuracy", +"sqlExpr" -> "\"percentile_approx(col, array(0.5, 0.4, 0.1), NULL)\""), + context = ExpectedContext( +"", "", 8, 57, "percentile_approx(col, array(0.5, 0.4, 0.1), NULL)")) +checkError( + exception = intercept[AnalysisException] { +sql( + """ +|SELECT percentile_approx(col, NULL, 100) +|FROM VALUES (0), (1), (2), (10) AS tab(col); +|""".stripMargin).collect() + }, + errorClass = "DATATYPE_MISMATCH