This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new de00ac8a05ae [SPARK-47765][SQL] Add SET COLLATION to parser rules de00ac8a05ae is described below commit de00ac8a05aedb3a150c8c10f76d1fe5496b1df3 Author: Mihailo Milosevic <mihailo.milose...@databricks.com> AuthorDate: Fri Apr 12 22:25:06 2024 +0800 [SPARK-47765][SQL] Add SET COLLATION to parser rules ### What changes were proposed in this pull request? Addition of a new statement SET COLLATION collationName. ### Why are the changes needed? Requested by srielau in order to follow other principles for session level defaults (e.g. SET TIME ZONE). ### Does this PR introduce _any_ user-facing change? Users now can use SET COLLATION statement to change session level default collation. ### How was this patch tested? Test added to `CollationSuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45946 from mihailom-db/SPARK-47765. Authored-by: Mihailo Milosevic <mihailo.milose...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/sql/catalyst/util/CollationFactory.java | 17 +++++++++++++++++ .../src/main/resources/error/error-classes.json | 5 +++++ .../apache/spark/internal/config/ConfigBuilder.scala | 4 ++-- ...rror-conditions-invalid-conf-value-error-class.md | 4 ++++ docs/sql-ref-ansi-compliance.md | 1 + .../apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 | 1 + .../spark/sql/catalyst/parser/SqlBaseParser.g4 | 2 ++ .../org/apache/spark/sql/internal/SQLConf.scala | 8 +++++++- .../resources/ansi-sql-2016-reserved-keywords.txt | 1 + .../apache/spark/sql/execution/SparkSqlParser.scala | 12 ++++++++++++ .../sql-tests/results/ansi/keywords.sql.out | 2 ++ .../resources/sql-tests/results/keywords.sql.out | 1 + .../org/apache/spark/sql/internal/SQLConfSuite.scala | 20 +++++++++++++++++++- .../ThriftServerWithSparkContextSuite.scala | 2 +- 14 files changed, 75 insertions(+), 5 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index ff7bc450f851..9786c559da44 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -202,6 +202,23 @@ public final class CollationFactory { return new StringSearch(pattern, target, (RuleBasedCollator) collator); } + /** + * Returns if the given collationName is valid one. + */ + public static boolean isValidCollation(String collationName) { + return collationNameToIdMap.containsKey(collationName.toUpperCase()); + } + + /** + * Returns closest valid name to collationName + */ + public static String getClosestCollation(String collationName) { + Collation suggestion = Collections.min(List.of(collationTable), Comparator.comparingInt( + c -> UTF8String.fromString(c.collationName).levenshteinDistance( + UTF8String.fromString(collationName.toUpperCase())))); + return suggestion.collationName; + } + /** * Returns a collation-unaware StringSearch object for the given pattern and target strings. * While this object does not respect collation, it can be used to find occurrences of the pattern diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 7b13fa4278e4..2a00edb9a4df 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -1881,6 +1881,11 @@ "The value '<confValue>' in the config \"<confName>\" is invalid." ], "subClass" : { + "DEFAULT_COLLATION" : { + "message" : [ + "Cannot resolve the given default collation. Did you mean '<proposal>'?" + ] + }, "TIME_ZONE" : { "message" : [ "Cannot resolve the given timezone." diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala index 303d856ca2c5..1f19e9444d38 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala @@ -117,12 +117,12 @@ private[spark] class TypedConfigBuilder[T]( def checkValue( validator: T => Boolean, errorClass: String, - parameters: Map[String, String]): TypedConfigBuilder[T] = { + parameters: T => Map[String, String]): TypedConfigBuilder[T] = { transform { v => if (!validator(v)) { throw new SparkIllegalArgumentException( errorClass = "INVALID_CONF_VALUE." + errorClass, - messageParameters = parameters ++ Map( + messageParameters = parameters(v) ++ Map( "confValue" -> v.toString, "confName" -> parent.key)) } diff --git a/docs/sql-error-conditions-invalid-conf-value-error-class.md b/docs/sql-error-conditions-invalid-conf-value-error-class.md index ae0975e16116..ac430956340f 100644 --- a/docs/sql-error-conditions-invalid-conf-value-error-class.md +++ b/docs/sql-error-conditions-invalid-conf-value-error-class.md @@ -30,6 +30,10 @@ The value '`<confValue>`' in the config "`<confName>`" is invalid. This error class has the following derived error classes: +## DEFAULT_COLLATION + +Cannot resolve the given default collation. Did you mean '`<proposal>`'? + ## TIME_ZONE Cannot resolve the given timezone. diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 9b933ec1f65c..bf1819b9767b 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -439,6 +439,7 @@ Below is a list of all the keywords in Spark SQL. |CLUSTERED|non-reserved|non-reserved|non-reserved| |CODEGEN|non-reserved|non-reserved|non-reserved| |COLLATE|reserved|non-reserved|reserved| +|COLLATION|reserved|non-reserved|reserved| |COLLECTION|non-reserved|non-reserved|non-reserved| |COLUMN|reserved|non-reserved|reserved| |COLUMNS|non-reserved|non-reserved|non-reserved| diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 index f5565f0a63fb..e2b178d34b56 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 @@ -129,6 +129,7 @@ CLUSTER: 'CLUSTER'; CLUSTERED: 'CLUSTERED'; CODEGEN: 'CODEGEN'; COLLATE: 'COLLATE'; +COLLATION: 'COLLATION'; COLLECTION: 'COLLECTION'; COLUMN: 'COLUMN'; COLUMNS: 'COLUMNS'; diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index 6e79d4af2f5e..3d008516589b 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -210,6 +210,7 @@ statement | (MSCK)? REPAIR TABLE identifierReference (option=(ADD|DROP|SYNC) PARTITIONS)? #repairTable | op=(ADD | LIST) identifier .*? #manageResource + | SET COLLATION collationName=identifier #setCollation | SET ROLE .*? #failNativeCommand | SET TIME ZONE interval #setTimeZone | SET TIME ZONE timezone #setTimeZone @@ -1662,6 +1663,7 @@ nonReserved | CLUSTERED | CODEGEN | COLLATE + | COLLATION | COLLECTION | COLUMN | COLUMNS diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 55d8b61f8b94..c8a5d997da7d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -772,6 +772,12 @@ object SQLConf { " produced by a builtin function such as to_char or CAST") .version("4.0.0") .stringConf + .checkValue(CollationFactory.isValidCollation, + "DEFAULT_COLLATION", + name => + Map( + "proposal" -> CollationFactory.getClosestCollation(name) + )) .createWithDefault("UTF8_BINARY") val FETCH_SHUFFLE_BLOCKS_IN_BATCH = @@ -2804,7 +2810,7 @@ object SQLConf { "short names are not recommended to use because they can be ambiguous.") .version("2.2.0") .stringConf - .checkValue(isValidTimezone, errorClass = "TIME_ZONE", parameters = Map.empty) + .checkValue(isValidTimezone, errorClass = "TIME_ZONE", parameters = tz => Map.empty) .createWithDefaultFunction(() => TimeZone.getDefault.getID) val WINDOW_EXEC_BUFFER_IN_MEMORY_THRESHOLD = diff --git a/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt b/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt index 47a3f02ac165..46da60b7897b 100644 --- a/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt +++ b/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt @@ -47,6 +47,7 @@ CLOB CLOSE COALESCE COLLATE +COLLATION COLLECT COLUMN COMMIT diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 28bcc33b1cdc..8192be269993 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -142,6 +142,18 @@ class SparkSqlAstBuilder extends AstBuilder { ResetCommand(Some(ctx.configKey().getText)) } + /** + * Create a [[SetCommand]] logical plan to set [[SQLConf.DEFAULT_COLLATION]] + * Example SQL : + * {{{ + * SET COLLATION UNICODE; + * }}} + */ + override def visitSetCollation(ctx: SetCollationContext): LogicalPlan = withOrigin(ctx) { + val key = SQLConf.DEFAULT_COLLATION.key + SetCommand(Some(key -> Some(ctx.identifier.getText.toUpperCase(Locale.ROOT)))) + } + /** * Create a [[SetCommand]] logical plan to set [[SQLConf.SESSION_LOCAL_TIMEZONE]] * Example SQL : diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out index c0b3a1e8cc55..8b4acd12911b 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out @@ -44,6 +44,7 @@ CLUSTER false CLUSTERED false CODEGEN false COLLATE true +COLLATION true COLLECTION false COLUMN true COLUMNS false @@ -356,6 +357,7 @@ CASE CAST CHECK COLLATE +COLLATION COLUMN CONSTRAINT CREATE diff --git a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out index 70df01e786ce..884f17c23eb0 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out @@ -44,6 +44,7 @@ CLUSTER false CLUSTERED false CODEGEN false COLLATE false +COLLATION false COLLECTION false COLUMN false COLUMNS false diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 03f6b9719b9c..18a06e83c076 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.internal -import java.util.TimeZone +import java.util.{Locale, TimeZone} import org.apache.hadoop.fs.Path import org.apache.logging.log4j.Level @@ -505,6 +505,24 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { |""".stripMargin) } + test("SPARK-47765: set collation") { + Seq("UNICODE", "UNICODE_CI", "utf8_binary_lcase", "utf8_binary").foreach { collation => + sql(s"set collation $collation") + assert(spark.conf.get(SQLConf.DEFAULT_COLLATION) === collation.toUpperCase(Locale.ROOT)) + } + + checkError( + exception = intercept[SparkIllegalArgumentException] { + sql(s"SET COLLATION unicode_c").collect() + }, + errorClass = "INVALID_CONF_VALUE.DEFAULT_COLLATION", + parameters = Map( + "confValue" -> "UNICODE_C", + "confName" -> "spark.sql.session.collation.default", + "proposal" -> "UNICODE_CI" + )) + } + test("SPARK-43028: config not found error") { checkError( exception = intercept[SparkNoSuchElementException](spark.conf.get("some.conf")), diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala index 26cf62d2323c..51123b17eeec 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala @@ -214,7 +214,7 @@ trait ThriftServerWithSparkContextSuite extends SharedThriftServer { val sessionHandle = client.openSession(user, "") val infoValue = client.getInfo(sessionHandle, GetInfoType.CLI_ODBC_KEYWORDS) // scalastyle:off line.size.limit - assert(infoValue.getStringValue == "ADD,AFTER,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BETWEEN,BIGINT,BINARY,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPUTE,CONCATENATE,CONSTRAINT,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DA [...] + assert(infoValue.getStringValue == "ADD,AFTER,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BETWEEN,BIGINT,BINARY,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPUTE,CONCATENATE,CONSTRAINT,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_US [...] // scalastyle:on line.size.limit } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org