Re: [PR] [SPARK-46382][SQL] XML: Capture values interspersed between elements [spark]

via GitHub Thu, 14 Dec 2023 23:45:53 -0800


sandip-db commented on code in PR #44318:
URL: https://github.com/apache/spark/pull/44318#discussion_r1427373383



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -567,6 +589,61 @@ class StaxXmlParser(
       castTo(data, FloatType).asInstanceOf[Float]
     }
   }
+  private[xml] def isEmptyString(c: Characters): Boolean = {
+    if (options.ignoreSurroundingSpaces) {
+      c.getData.trim.isEmpty
+    } else {
+      c.isWhiteSpace
+    }
+  }
+
+  @tailrec
+  private def parseAndCheckEndElement(
+      row: Array[Any],
+      schema: StructType,
+      parser: XMLEventReader): Boolean = {
+    parser.peek match {
+      case _: EndElement | _: EndDocument => true
+      case _: StartElement => false
+      case c: Characters if !isEmptyString(c) =>
+        parser.nextEvent()
+        addOrUpdate(row, schema, options.valueTag, c.getData)
+        parseAndCheckEndElement(row, schema, parser)
+      case _ =>
+        parser.nextEvent()
+        parseAndCheckEndElement(row, schema, parser)
+    }
+  }
+
+  private def addOrUpdate(
+      row: Array[Any],
+      schema: StructType,
+      name: String,
+      string: String,

Review Comment:
   ```suggestion
         data: String,
   ```



##########
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala:
##########
@@ -2178,4 +2166,49 @@ class XmlSuite extends QueryTest with SharedSparkSession 
{
     )
     testWriteReadRoundTrip(df, Map("nullValue" -> "null", "prefersDecimal" -> 
"true"))
   }
+
+  test("capture values interspersed between elements - simple") {
+    val df = spark.read.format("xml")
+      .option("rowTag", "ROW")
+      .option("multiLine", "true")
+      .load(getTestResourcePath(resDir + "values-simple.xml"))

Review Comment:
   Simple XML data can be embedded in the test suite itself, for example by
   [using `spark.createDataset`](https://github.com/apache/spark/blob/c045a425bf0c472f164e3ef75a8a2c68d72d61d3/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala#L1537)
   or by [writing to a temp file](https://github.com/apache/spark/blob/c045a425bf0c472f164e3ef75a8a2c68d72d61d3/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala#L2294).



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -255,7 +275,12 @@ class StaxXmlParser(
         case e: StartElement =>
           kvPairs +=
             
(UTF8String.fromString(StaxXmlParserUtils.getName(e.asStartElement.getName, 
options)) ->
-             convertField(parser, valueType))
+            convertField(parser, valueType))
+        case c: Characters if !isEmptyString(c) =>

Review Comment:
   ```suggestion
           case c: Characters if !c.isWhiteSpace =>
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -194,27 +210,30 @@ class StaxXmlParser(
       case (_: EndElement, _: DataType) => null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values 
are merged later.
+        parser.next
         convertTo(c.getData, st)
       case (c: Characters, st: StructType) =>
-        // If a value tag is present, this can be an attribute-only element 
whose values is in that
-        // value tag field. Or, it can be a mixed-type element with both some 
character elements
-        // and other complex structure. Character elements are ignored.
-        val attributesOnly = st.fields.forall { f =>
-          f.name == options.valueTag || 
f.name.startsWith(options.attributePrefix)
-        }
-        if (attributesOnly) {
-          // If everything else is an attribute column, there's no complex 
structure.
-          // Just return the value of the character element, or null if we 
don't have a value tag
-          st.find(_.name == options.valueTag).map(
-            valueTag => convertTo(c.getData, valueTag.dataType)).orNull
-        } else {
-          // Otherwise, ignore this character element, and continue parsing 
the following complex
-          // structure
-          parser.next
-          parser.peek match {
-            case _: EndElement => null // no struct here at all; done
-            case _ => convertObject(parser, st)
-          }
+        parser.next
+        parser.peek match {
+          case _: EndElement =>
+            // It couldn't be an array of value tags
+            // as the opening tag is immediately followed by a closing tag.
+            if (isEmptyString(c)) {

Review Comment:
   Let's not allow any whitespace-only values for `valueTag`.
   
   ```suggestion
               if (c.isWhiteSpace) {
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala:
##########
@@ -70,7 +69,6 @@ object StaxXmlParserUtils {
   /**
    * Checks if current event points the EndElement.
    */
-  @tailrec

Review Comment:
   Why was this removed?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -171,16 +171,18 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: 
Boolean)
           case _: EndElement => StringType
           case _ => inferField(parser)
         }
-      case c: Characters if !c.isWhiteSpace =>
+      // what about new line character
+      case c: Characters if !isEmptyString(c) =>

Review Comment:
   For this case, can't we return `inferObject(parser)`?
   In `inferObject(parser)`, the case for `StructType` can be updated to 
"unnest" `StructType` with just `valueTag`.
   Without this, there is a lot of duplicated logic for `valueTag`.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -194,27 +210,30 @@ class StaxXmlParser(
       case (_: EndElement, _: DataType) => null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values 
are merged later.
+        parser.next
         convertTo(c.getData, st)
       case (c: Characters, st: StructType) =>
-        // If a value tag is present, this can be an attribute-only element 
whose values is in that
-        // value tag field. Or, it can be a mixed-type element with both some 
character elements
-        // and other complex structure. Character elements are ignored.
-        val attributesOnly = st.fields.forall { f =>
-          f.name == options.valueTag || 
f.name.startsWith(options.attributePrefix)
-        }
-        if (attributesOnly) {
-          // If everything else is an attribute column, there's no complex 
structure.
-          // Just return the value of the character element, or null if we 
don't have a value tag
-          st.find(_.name == options.valueTag).map(
-            valueTag => convertTo(c.getData, valueTag.dataType)).orNull
-        } else {
-          // Otherwise, ignore this character element, and continue parsing 
the following complex
-          // structure
-          parser.next
-          parser.peek match {
-            case _: EndElement => null // no struct here at all; done
-            case _ => convertObject(parser, st)
-          }
+        parser.next
+        parser.peek match {
+          case _: EndElement =>
+            // It couldn't be an array of value tags
+            // as the opening tag is immediately followed by a closing tag.
+            if (isEmptyString(c)) {
+              return null
+            }
+            val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
+            indexOpt match {
+              case Some(index) =>
+                convertTo(c.getData, st.fields(index).dataType)
+              case None => null
+            }
+          case _ =>
+            val row = convertObject(parser, st)
+            if (!isEmptyString(c)) {

Review Comment:
   ```suggestion
               if (!c.isWhiteSpace) {
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -194,27 +210,30 @@ class StaxXmlParser(
       case (_: EndElement, _: DataType) => null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values 
are merged later.
+        parser.next
         convertTo(c.getData, st)
       case (c: Characters, st: StructType) =>
-        // If a value tag is present, this can be an attribute-only element 
whose values is in that
-        // value tag field. Or, it can be a mixed-type element with both some 
character elements
-        // and other complex structure. Character elements are ignored.
-        val attributesOnly = st.fields.forall { f =>
-          f.name == options.valueTag || 
f.name.startsWith(options.attributePrefix)
-        }
-        if (attributesOnly) {
-          // If everything else is an attribute column, there's no complex 
structure.
-          // Just return the value of the character element, or null if we 
don't have a value tag
-          st.find(_.name == options.valueTag).map(
-            valueTag => convertTo(c.getData, valueTag.dataType)).orNull
-        } else {
-          // Otherwise, ignore this character element, and continue parsing 
the following complex
-          // structure
-          parser.next
-          parser.peek match {
-            case _: EndElement => null // no struct here at all; done
-            case _ => convertObject(parser, st)
-          }
+        parser.next
+        parser.peek match {
+          case _: EndElement =>
+            // It couldn't be an array of value tags
+            // as the opening tag is immediately followed by a closing tag.
+            if (isEmptyString(c)) {
+              return null
+            }
+            val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
+            indexOpt match {
+              case Some(index) =>
+                convertTo(c.getData, st.fields(index).dataType)
+              case None => null
+            }
+          case _ =>
+            val row = convertObject(parser, st)
+            if (!isEmptyString(c)) {
+              addOrUpdate(row.toSeq(st).toArray, st, options.valueTag, 
c.getData, addToTail = false)
+            } else {
+              row
+            }
         }
       case (_: Characters, _: StringType) =>

Review Comment:
   Is `parser.next` not required here?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -567,6 +589,61 @@ class StaxXmlParser(
       castTo(data, FloatType).asInstanceOf[Float]
     }
   }
+  private[xml] def isEmptyString(c: Characters): Boolean = {
+    if (options.ignoreSurroundingSpaces) {
+      c.getData.trim.isEmpty

Review Comment:
   This is the same as `c.isWhiteSpace`.



##########
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala:
##########
@@ -1729,9 +1729,15 @@ class XmlSuite extends QueryTest with SharedSparkSession 
{
     val TAG_NAME = "tag"
     val VALUETAG_NAME = "_VALUE"
     val schema = buildSchema(
+      field(VALUETAG_NAME),

Review Comment:
   Why were the fields rearranged?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -194,27 +210,30 @@ class StaxXmlParser(
       case (_: EndElement, _: DataType) => null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values 
are merged later.
+        parser.next
         convertTo(c.getData, st)
       case (c: Characters, st: StructType) =>
-        // If a value tag is present, this can be an attribute-only element 
whose values is in that
-        // value tag field. Or, it can be a mixed-type element with both some 
character elements
-        // and other complex structure. Character elements are ignored.
-        val attributesOnly = st.fields.forall { f =>
-          f.name == options.valueTag || 
f.name.startsWith(options.attributePrefix)
-        }
-        if (attributesOnly) {
-          // If everything else is an attribute column, there's no complex 
structure.
-          // Just return the value of the character element, or null if we 
don't have a value tag
-          st.find(_.name == options.valueTag).map(
-            valueTag => convertTo(c.getData, valueTag.dataType)).orNull
-        } else {
-          // Otherwise, ignore this character element, and continue parsing 
the following complex
-          // structure
-          parser.next
-          parser.peek match {
-            case _: EndElement => null // no struct here at all; done
-            case _ => convertObject(parser, st)
-          }
+        parser.next
+        parser.peek match {
+          case _: EndElement =>
+            // It couldn't be an array of value tags
+            // as the opening tag is immediately followed by a closing tag.
+            if (isEmptyString(c)) {
+              return null
+            }
+            val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
+            indexOpt match {
+              case Some(index) =>
+                convertTo(c.getData, st.fields(index).dataType)

Review Comment:
   Shouldn't this be returning a `Row`? If yes, please make sure that there is 
a test scenario covering this.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -398,15 +424,11 @@ class StaxXmlParser(
             badRecordException = badRecordException.orElse(Some(e))
         }
 
-        case c: Characters if !c.isWhiteSpace && isRootAttributesOnly =>
-          nameToIndex.get(options.valueTag) match {
-            case Some(index) =>
-              row(index) = convertTo(c.getData, schema(index).dataType)
-            case None => // do nothing
-          }
+        case c: Characters if !isEmptyString(c) =>

Review Comment:
   ```suggestion
           case c: Characters if !c.isWhiteSpace =>
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -159,7 +159,7 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: 
Boolean)
     parser.peek match {
       case _: EndElement => NullType
       case _: StartElement => inferObject(parser)
-      case c: Characters if c.isWhiteSpace =>
+      case c: Characters if isEmptyString(c) =>

Review Comment:
   ```suggestion
         case c: Characters if c.isWhiteSpace =>
   ```



##########
sql/core/src/test/resources/test-data/xml-resources/values-simple.xml:
##########
@@ -0,0 +1,11 @@
+<?xml version="1.0"?>
+<ROWSET>
+    <ROW>
+        value1
+        <a>
+            value2
+            <b>1</b>
+            value3
+        </a>
+    </ROW>

Review Comment:
   ```suggestion
           </a>
           value4
       </ROW>
   ```



##########
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala:
##########
@@ -1145,18 +1145,18 @@ class XmlSuite extends QueryTest with 
SharedSparkSession {
       .option("inferSchema", true)
       .xml(getTestResourcePath(resDir + "mixed_children.xml"))
     val mixedRow = mixedDF.head()
-    assert(mixedRow.getAs[Row](0).toSeq === Seq(" lorem "))
-    assert(mixedRow.getString(1) === " ipsum ")
+    assert(mixedRow.getAs[Row](0) === Row(List("issue", "text ignored"), 
"lorem"))

Review Comment:
   Replace `text ignored` with a different string.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Re: [PR] [SPARK-46382][SQL] XML: Capture values interspersed between elements [spark]

Reply via email to