This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new ae518ecb7068 [SPARK-47218][SQL] XML: Changed SchemaOfXml to fail on DROPMALFORMED mode ae518ecb7068 is described below commit ae518ecb7068347f70d947255eb54fdfd5ec8d48 Author: Yousof Hosny <yousof.ho...@databricks.com> AuthorDate: Mon Mar 11 08:40:19 2024 +0900 [SPARK-47218][SQL] XML: Changed SchemaOfXml to fail on DROPMALFORMED mode ### What changes were proposed in this pull request? Changed schema_of_xml to fail with an error on DROPMALFORMED mode to avoid creating schemas out of invalid XML. ### Why are the changes needed? DROPMALFORMED parse mode implies silently dropping the malformed record. But SchemaOfXml is expected to return a schema and may not have a valid schema to return for a malformed record. So DROPMALFORMED cannot be supported. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45379 from yhosny/xml-parsemode-error. 
Authored-by: Yousof Hosny <yousof.ho...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../sql/catalyst/expressions/xmlExpressions.scala | 8 +++-- .../sql/execution/datasources/xml/XmlSuite.scala | 36 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala index 800515ca84b5..8cc1c3a89745 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.catalyst.util.{ArrayData, FailFastMode, FailureSafeParser, GenericArrayData, PermissiveMode} +import org.apache.spark.sql.catalyst.util.{ArrayData, DropMalformedMode, FailFastMode, FailureSafeParser, GenericArrayData, PermissiveMode} import org.apache.spark.sql.catalyst.xml.{StaxXmlGenerator, StaxXmlParser, ValidatorUtil, XmlInferSchema, XmlOptions} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase} import org.apache.spark.sql.internal.SQLConf @@ -189,8 +189,12 @@ case class SchemaOfXml( private lazy val xmlFactory = xmlOptions.buildXmlFactory() @transient - private lazy val xmlInferSchema = + private lazy val xmlInferSchema = { + if (xmlOptions.parseMode == DropMalformedMode) { + throw QueryCompilationErrors.parseModeUnsupportedError("schema_of_xml", xmlOptions.parseMode) + } new XmlInferSchema(xmlOptions, caseSensitive = SQLConf.get.caseSensitiveAnalysis) + } @transient private lazy val xml = 
child.eval().asInstanceOf[UTF8String] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala index 2194f76e7da6..d7dc96184dab 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala @@ -1302,6 +1302,42 @@ class XmlSuite assert(result.select("decoded._corrupt_record").head().getString(0).nonEmpty) } + test("schema_of_xml with DROPMALFORMED parse error test") { + val e = intercept[AnalysisException] { + spark.sql(s"""SELECT schema_of_xml('<ROW><a>1<ROW>', map('mode', 'DROPMALFORMED'))""") + .collect() + } + checkError( + exception = e, + errorClass = "_LEGACY_ERROR_TEMP_1099", + parameters = Map( + "funcName" -> "schema_of_xml", + "mode" -> "DROPMALFORMED", + "permissiveMode" -> "PERMISSIVE", + "failFastMode" -> FailFastMode.name) + ) + } + + test("schema_of_xml with FAILFAST parse error test") { + val e = intercept[SparkException] { + spark.sql(s"""SELECT schema_of_xml('<ROW><a>1<ROW>', map('mode', 'FAILFAST'))""") + .collect() + } + checkError( + exception = e, + errorClass = "_LEGACY_ERROR_TEMP_2165", + parameters = Map( + "failFastMode" -> FailFastMode.name) + ) + } + + test("schema_of_xml with PERMISSIVE check no error test") { + val s = spark.sql(s"""SELECT schema_of_xml('<ROW><a>1<ROW>', map('mode', 'PERMISSIVE'))""") + .collect() + assert(s.head.get(0) == "STRUCT<_corrupt_record: STRING>") + } + + test("from_xml with PERMISSIVE parse mode with no corrupt col schema") { // XML contains error val xmlData = --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org