This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 5badb2446fa [SPARK-41228][SQL] Rename & Improve error message for `COLUMN_NOT_IN_GROUP_BY_CLAUSE` 5badb2446fa is described below commit 5badb2446fa2b51e8ea239ced6c9b44178b2f1fa Author: itholic <haejoon....@databricks.com> AuthorDate: Thu Dec 1 09:18:17 2022 +0300 [SPARK-41228][SQL] Rename & Improve error message for `COLUMN_NOT_IN_GROUP_BY_CLAUSE` ### What changes were proposed in this pull request? This PR proposes to rename `COLUMN_NOT_IN_GROUP_BY_CLAUSE` to `MISSING_AGGREGATION`. Also, improve its error message. ### Why are the changes needed? The current error class name and its error message don't illustrate the error cause and resolution correctly. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? ``` ./build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite*" ``` Closes #38769 from itholic/SPARK-41128. Authored-by: itholic <haejoon....@databricks.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- core/src/main/resources/error/error-classes.json | 13 +++++++------ .../sql/tests/pandas/test_pandas_udf_grouped_agg.py | 2 +- .../apache/spark/sql/errors/QueryCompilationErrors.scala | 7 +++++-- .../spark/sql/catalyst/analysis/AnalysisErrorSuite.scala | 7 +++++-- .../src/test/resources/sql-tests/results/extract.sql.out | 2 ++ .../resources/sql-tests/results/group-by-filter.sql.out | 10 ++++++---- .../src/test/resources/sql-tests/results/group-by.sql.out | 15 +++++++++------ .../test/resources/sql-tests/results/grouping_set.sql.out | 5 +++-- .../sql-tests/results/postgreSQL/create_view.sql.out | 5 +++-- .../sql-tests/results/udaf/udaf-group-by-ordinal.sql.out | 15 +++++++++------ .../sql-tests/results/udaf/udaf-group-by.sql.out | 15 +++++++++------ .../resources/sql-tests/results/udf/udf-group-by.sql.out | 15 +++++++++------ .../org/apache/spark/sql/execution/SQLViewSuite.scala | 5 +++-- 13 files changed, 71 insertions(+), 45 deletions(-) diff 
--git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index a79c02e1f1d..65b6dc68d12 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -109,12 +109,6 @@ "The column <columnName> already exists. Consider to choose another name or rename the existing column." ] }, - "COLUMN_NOT_IN_GROUP_BY_CLAUSE" : { - "message" : [ - "The expression <expression> is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in `first()` (or `first_value()`) if you don't care which value you get." - ], - "sqlState" : "42000" - }, "CONCURRENT_QUERY" : { "message" : [ "Another instance of this query was just started by a concurrent session." @@ -830,6 +824,13 @@ "Malformed Protobuf messages are detected in message deserialization. Parse Mode: <failFastMode>. To process malformed protobuf message as null result, try setting the option 'mode' as 'PERMISSIVE'." ] }, + "MISSING_AGGREGATION" : { + "message" : [ + "The non-aggregating expression <expression> is based on columns which are not participating in the GROUP BY clause.", + "Add the columns or the expression to the GROUP BY, aggregate the expression, or use <expressionAnyValue> if you do not care which of the values within a group is returned." 
+ ], + "sqlState" : "42000" + }, "MISSING_STATIC_PARTITION_COLUMN" : { "message" : [ "Unknown static partition column: <columnName>" diff --git a/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py index 6f475624b74..aa844fc5fd5 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_udf_grouped_agg.py @@ -475,7 +475,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase): mean_udf = self.pandas_agg_mean_udf with QuietTest(self.sc): - with self.assertRaisesRegex(AnalysisException, "nor.*aggregate function"): + with self.assertRaisesRegex(AnalysisException, "[MISSING_AGGREGATION]"): df.groupby(df.id).agg(plus_one(df.v)).collect() with QuietTest(self.sc): diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index e5b1c3c100d..fc9a08104b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableExceptio import org.apache.spark.sql.catalyst.catalog.{CatalogTable, InvalidUDFClassException} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, AttributeSet, CreateMap, CreateStruct, Expression, GroupingID, NamedExpression, SpecifiedWindowFrame, WindowFrame, WindowFunction, WindowSpecDefinition} +import org.apache.spark.sql.catalyst.expressions.aggregate.AnyValue import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, Join, LogicalPlan, SerdeInfo, Window} import org.apache.spark.sql.catalyst.trees.{Origin, 
TreeNode} @@ -3203,8 +3204,10 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { def columnNotInGroupByClauseError(expression: Expression): Throwable = { new AnalysisException( - errorClass = "COLUMN_NOT_IN_GROUP_BY_CLAUSE", - messageParameters = Map("expression" -> toSQLExpr(expression)) + errorClass = "MISSING_AGGREGATION", + messageParameters = Map( + "expression" -> toSQLExpr(expression), + "expressionAnyValue" -> toSQLExpr(new AnyValue(expression))) ) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index ea8a2c3068d..9acdad0a428 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -359,10 +359,13 @@ class AnalysisErrorSuite extends AnalysisTest { testRelation.join(testRelation, condition = Some(Literal(1))), "condition" :: "'1'" :: "not a boolean" :: Literal(1).dataType.simpleString :: Nil) - errorTest( + errorClassTest( "missing group by", testRelation2.groupBy($"a")($"b"), - "\"b\"" :: "COLUMN_NOT_IN_GROUP_BY_CLAUSE" :: Nil + "MISSING_AGGREGATION", + messageParameters = Map( + "expression" -> "\"b\"", + "expressionAnyValue" -> "\"any_value(b)\"") ) errorTest( diff --git a/sql/core/src/test/resources/sql-tests/results/extract.sql.out b/sql/core/src/test/resources/sql-tests/results/extract.sql.out index 30aa0ea04ea..863b568be1e 100644 --- a/sql/core/src/test/resources/sql-tests/results/extract.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/extract.sql.out @@ -376,6 +376,7 @@ org.apache.spark.sql.AnalysisException } ] } + -- !query select date_part('year', c), date_part('year', ntz), date_part('year', i) from t -- !query schema @@ -1147,6 +1148,7 @@ org.apache.spark.sql.AnalysisException } ] } + -- !query select 
date_part('not_supported', interval '123 12:34:56.789123123' DAY TO SECOND) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out index 01033fa564a..5b0231809e8 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out @@ -242,10 +242,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"a\"" + "expression" : "\"a\"", + "expressionAnyValue" : "\"any_value(a)\"" } } @@ -728,10 +729,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"a\"" + "expression" : "\"a\"", + "expressionAnyValue" : "\"any_value(a)\"" } } diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 52138aa9866..a6202b0c1d7 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -57,10 +57,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"a\"" + "expression" : "\"a\"", + "expressionAnyValue" : "\"any_value(a)\"" } } @@ -127,10 +128,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"a\"" + "expression" : 
"\"a\"", + "expressionAnyValue" : "\"any_value(a)\"" } } @@ -243,10 +245,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"k\"" + "expression" : "\"k\"", + "expressionAnyValue" : "\"any_value(k)\"" } } diff --git a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out index 83b721373da..8112dbf4e5a 100644 --- a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out @@ -167,10 +167,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"c1\"" + "expression" : "\"c1\"", + "expressionAnyValue" : "\"any_value(c1)\"" } } diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out index 1e1c5b36a62..d39c32399b6 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out @@ -54,10 +54,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"data\"" + "expression" : "\"data\"", + "expressionAnyValue" : "\"any_value(data)\"" } } diff --git a/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-by-ordinal.sql.out index b696de04e9a..a1b9a3e91c5 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-by-ordinal.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-by-ordinal.sql.out @@ -93,10 +93,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"a\"" + "expression" : "\"a\"", + "expressionAnyValue" : "\"any_value(a)\"" } } @@ -108,10 +109,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"a\"" + "expression" : "\"a\"", + "expressionAnyValue" : "\"any_value(a)\"" } } @@ -349,10 +351,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"b\"" + "expression" : "\"b\"", + "expressionAnyValue" : "\"any_value(b)\"" } } diff --git a/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-by.sql.out index a2b6b03b2b2..8ed164820c6 100644 --- a/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-by.sql.out @@ -57,10 +57,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"a\"" + "expression" : "\"a\"", + "expressionAnyValue" : "\"any_value(a)\"" } } @@ -119,10 +120,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", 
"sqlState" : "42000", "messageParameters" : { - "expression" : "\"a\"" + "expression" : "\"a\"", + "expressionAnyValue" : "\"any_value(a)\"" } } @@ -234,10 +236,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"k\"" + "expression" : "\"k\"", + "expressionAnyValue" : "\"any_value(k)\"" } } diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out index a7524744372..61c47255eda 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out @@ -57,10 +57,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"a\"" + "expression" : "\"a\"", + "expressionAnyValue" : "\"any_value(a)\"" } } @@ -127,10 +128,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"a\"" + "expression" : "\"a\"", + "expressionAnyValue" : "\"any_value(a)\"" } } @@ -220,10 +222,11 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + "errorClass" : "MISSING_AGGREGATION", "sqlState" : "42000", "messageParameters" : { - "expression" : "\"k\"" + "expression" : "\"k\"", + "expressionAnyValue" : "\"any_value(k)\"" } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index bb14e2c32d3..d3b43059d35 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -952,9 +952,10 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { sql("SELECT * FROM v3") } checkError(e, - errorClass = "COLUMN_NOT_IN_GROUP_BY_CLAUSE", + errorClass = "MISSING_AGGREGATION", parameters = Map( - "expression" -> "\"c1\"")) + "expression" -> "\"c1\"", + "expressionAnyValue" -> "\"any_value(c1)\"")) } withSQLConf(GROUP_BY_ALIASES.key -> "false") { val e = intercept[AnalysisException] { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org