This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new deb1d1b86162 [SPARK-46394][SQL] Fix spark.catalog.listDatabases() issues on schemas with special characters when `spark.sql.legacy.keepCommandOutputSchema` set to true deb1d1b86162 is described below commit deb1d1b861626cfcb2522d5907f12b1e69cbfb72 Author: Xinyi Yu <xinyi...@databricks.com> AuthorDate: Wed Dec 13 22:18:24 2023 -0800 [SPARK-46394][SQL] Fix spark.catalog.listDatabases() issues on schemas with special characters when `spark.sql.legacy.keepCommandOutputSchema` set to true ### What changes were proposed in this pull request? When the SQL conf `spark.sql.legacy.keepCommandOutputSchema` is set to true: Before: ``` // suppose there is a xyyu-db-with-hyphen schema in the catalog spark.catalog.listDatabases() [INVALID_IDENTIFIER] The identifier xyyu-db-with-hyphen is invalid. Please, consider quoting it with back-quotes as `xyyu-db-with-hyphen`. SQLSTATE: 42602 (line 1, pos 4) ``` After: ``` spark.catalog.listDatabases() .. `xyyu-db-with-hyphen` .. ``` This PR fixes the issue by falling back to the original name when parsing fails. ### Why are the changes needed? To fix the bug. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Newly added tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44328 from anchovYu/special-char-schema-name-issue. 
Authored-by: Xinyi Yu <xinyi...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../apache/spark/sql/internal/CatalogImpl.scala | 21 ++++++++--- .../apache/spark/sql/internal/CatalogSuite.scala | 41 ++++++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index ca1a2d49d72d..a58f5d358b4c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{CreateTable, LocalRelation, LogicalPlan, OptionList, RecoverPartitions, ShowFunctions, ShowNamespaces, ShowTables, UnresolvedTableSpec, View} import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.connector.catalog.{CatalogManager, SupportsNamespaces, TableCatalog} @@ -49,8 +50,16 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { private def sessionCatalog: SessionCatalog = sparkSession.sessionState.catalog - private def parseIdent(name: String): Seq[String] = { - sparkSession.sessionState.sqlParser.parseMultipartIdentifier(name) + /** + * Helper function for parsing identifiers. + * @param fallbackOnException if true, when parsing fails, return the original name. 
+ */ + private def parseIdent(name: String, fallbackOnException: Boolean = false): Seq[String] = { + try { + sparkSession.sessionState.sqlParser.parseMultipartIdentifier(name) + } catch { + case _: ParseException if fallbackOnException => Seq(name) + } } private def qualifyV1Ident(nameParts: Seq[String]): Seq[String] = { @@ -100,7 +109,9 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { case ShowNamespaces(r: ResolvedNamespace, _, _) => r.catalog }.get val databases = qe.toRdd.collect().map { row => - makeDatabase(Some(catalog.name()), row.getString(0)) + // dbName can either be a quoted identifier (single or multi part) or an unquoted single part + val dbName = row.getString(0) + makeDatabase(Some(catalog.name()), dbName) } CatalogImpl.makeDataset(databases.toImmutableArraySeq, sparkSession) } @@ -424,9 +435,11 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { makeDatabase(None, dbName) } + // when catalogName is specified, dbName should be a valid quoted multi-part identifier, or a + // valid unquoted single part identifier. 
private def makeDatabase(catalogNameOpt: Option[String], dbName: String): Database = { val idents = catalogNameOpt match { - case Some(catalogName) => catalogName +: parseIdent(dbName) + case Some(catalogName) => catalogName +: parseIdent(dbName, fallbackOnException = true) case None => resolveNamespace(dbName) } val plan = UnresolvedNamespace(idents, fetchMetadata = true) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala index 6522ebad1a4f..15733d1c8bf6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala @@ -167,6 +167,47 @@ class CatalogSuite extends SharedSparkSession with AnalysisTest with BeforeAndAf Set("default", "my_db2")) } + test("list databases with special character") { + Seq(true, false).foreach { legacy => + withSQLConf(SQLConf.LEGACY_KEEP_COMMAND_OUTPUT_SCHEMA.key -> legacy.toString) { + spark.catalog.setCurrentCatalog(CatalogManager.SESSION_CATALOG_NAME) + assert(spark.catalog.listDatabases().collect().map(_.name).toSet == Set("default")) + // use externalCatalog to bypass the database name validation in SessionCatalog + spark.sharedState.externalCatalog.createDatabase(utils.newDb("my-db1"), false) + spark.sharedState.externalCatalog.createDatabase(utils.newDb("my`db2"), false) + assert(spark.catalog.listDatabases().collect().map(_.name).toSet == + Set("default", "`my-db1`", "`my``db2`")) + // TODO: ideally there should be no difference between legacy and non-legacy mode. However, + // in non-legacy mode, the ShowNamespacesExec does the quoting before pattern matching, + // requiring the pattern to be quoted. This is not ideal, we should fix it in the future. 
+ if (legacy) { + assert( + spark.catalog.listDatabases("my*").collect().map(_.name).toSet == + Set("`my-db1`", "`my``db2`") + ) + assert(spark.catalog.listDatabases("`my*`").collect().map(_.name).toSet == Set.empty) + } else { + assert(spark.catalog.listDatabases("my*").collect().map(_.name).toSet == Set.empty) + assert( + spark.catalog.listDatabases("`my*`").collect().map(_.name).toSet == + Set("`my-db1`", "`my``db2`") + ) + } + assert(spark.catalog.listDatabases("you*").collect().map(_.name).toSet == + Set.empty[String]) + dropDatabase("my-db1") + assert(spark.catalog.listDatabases().collect().map(_.name).toSet == + Set("default", "`my``db2`")) + dropDatabase("my`db2") // cleanup + + spark.catalog.setCurrentCatalog("testcat") + sql(s"CREATE NAMESPACE testcat.`my-db`") + assert(spark.catalog.listDatabases().collect().map(_.name).toSet == Set("`my-db`")) + sql(s"DROP NAMESPACE testcat.`my-db`") // cleanup + } + } + } + test("list databases with current catalog") { spark.catalog.setCurrentCatalog("testcat") sql(s"CREATE NAMESPACE testcat.my_db") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org