This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new ae518ecb7068 [SPARK-47218][SQL] XML: Changed SchemaOfXml to fail on 
DROPMALFORMED mode
ae518ecb7068 is described below

commit ae518ecb7068347f70d947255eb54fdfd5ec8d48
Author: Yousof Hosny <yousof.ho...@databricks.com>
AuthorDate: Mon Mar 11 08:40:19 2024 +0900

    [SPARK-47218][SQL] XML: Changed SchemaOfXml to fail on DROPMALFORMED mode
    
    ### What changes were proposed in this pull request?
    
    Changed schema_of_xml to fail with an error in DROPMALFORMED mode to 
avoid creating schemas out of invalid XML.
    
    ### Why are the changes needed?
    
    DROPMALFORMED parse mode implies silently dropping the malformed record. But 
SchemaOfXml is expected to return a schema and may not have a valid schema to 
return for a malformed record. So DROPMALFORMED cannot be supported.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Unit test.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #45379 from yhosny/xml-parsemode-error.
    
    Authored-by: Yousof Hosny <yousof.ho...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 .../sql/catalyst/expressions/xmlExpressions.scala  |  8 +++--
 .../sql/execution/datasources/xml/XmlSuite.scala   | 36 ++++++++++++++++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
index 800515ca84b5..8cc1c3a89745 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
-import org.apache.spark.sql.catalyst.util.{ArrayData, FailFastMode, 
FailureSafeParser, GenericArrayData, PermissiveMode}
+import org.apache.spark.sql.catalyst.util.{ArrayData, DropMalformedMode, 
FailFastMode, FailureSafeParser, GenericArrayData, PermissiveMode}
 import org.apache.spark.sql.catalyst.xml.{StaxXmlGenerator, StaxXmlParser, 
ValidatorUtil, XmlInferSchema, XmlOptions}
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase}
 import org.apache.spark.sql.internal.SQLConf
@@ -189,8 +189,12 @@ case class SchemaOfXml(
   private lazy val xmlFactory = xmlOptions.buildXmlFactory()
 
   @transient
-  private lazy val xmlInferSchema =
+  private lazy val xmlInferSchema = {
+    if (xmlOptions.parseMode == DropMalformedMode) {
+      throw QueryCompilationErrors.parseModeUnsupportedError("schema_of_xml", 
xmlOptions.parseMode)
+    }
     new XmlInferSchema(xmlOptions, caseSensitive = 
SQLConf.get.caseSensitiveAnalysis)
+  }
 
   @transient
   private lazy val xml = child.eval().asInstanceOf[UTF8String]
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
index 2194f76e7da6..d7dc96184dab 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
@@ -1302,6 +1302,42 @@ class XmlSuite
     
assert(result.select("decoded._corrupt_record").head().getString(0).nonEmpty)
   }
 
+  test("schema_of_xml with DROPMALFORMED parse error test") {
+    val e = intercept[AnalysisException] {
+       spark.sql(s"""SELECT schema_of_xml('<ROW><a>1<ROW>', map('mode', 
'DROPMALFORMED'))""")
+         .collect()
+    }
+    checkError(
+      exception = e,
+      errorClass = "_LEGACY_ERROR_TEMP_1099",
+      parameters = Map(
+        "funcName" -> "schema_of_xml",
+        "mode" -> "DROPMALFORMED",
+        "permissiveMode" -> "PERMISSIVE",
+        "failFastMode" -> FailFastMode.name)
+    )
+  }
+
+  test("schema_of_xml with FAILFAST parse error test") {
+    val e = intercept[SparkException] {
+       spark.sql(s"""SELECT schema_of_xml('<ROW><a>1<ROW>', map('mode', 
'FAILFAST'))""")
+         .collect()
+    }
+    checkError(
+      exception = e,
+      errorClass = "_LEGACY_ERROR_TEMP_2165",
+      parameters = Map(
+        "failFastMode" -> FailFastMode.name)
+    )
+  }
+
+  test("schema_of_xml with PERMISSIVE check no error test") {
+      val s = spark.sql(s"""SELECT schema_of_xml('<ROW><a>1<ROW>', map('mode', 
'PERMISSIVE'))""")
+        .collect()
+      assert(s.head.get(0) == "STRUCT<_corrupt_record: STRING>")
+  }
+
+
   test("from_xml with PERMISSIVE parse mode with no corrupt col schema") {
     // XML contains error
     val xmlData =


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to