Re: [PR] [SPARK-46382][SQL] XML: Capture values interspersed between elements [spark]

via GitHub Thu, 21 Dec 2023 11:08:25 -0800


sandip-db commented on code in PR #44318:
URL: https://github.com/apache/spark/pull/44318#discussion_r1434410674



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -243,6 +244,22 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: 
Boolean)
       }
     }
 
+    @tailrec
+    def inferAndCheckEndElement(parser: XMLEventReader): Boolean = {
+      parser.peek match {
+        case _: EndElement | _: EndDocument => true
+        case _: StartElement => false
+        case c: Characters if !c.isWhiteSpace =>
+          val characterType = inferFrom(c.getData)
+          parser.nextEvent()
+          addOrUpdateType(options.valueTag, characterType)

Review Comment:
   Is there a test case for this scenario?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -201,27 +217,30 @@ class StaxXmlParser(
       case (_: EndElement, _: DataType) => null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values 
are merged later.
+        parser.next
         convertTo(c.getData, st)
       case (c: Characters, st: StructType) =>
-        // If a value tag is present, this can be an attribute-only element 
whose values is in that
-        // value tag field. Or, it can be a mixed-type element with both some 
character elements
-        // and other complex structure. Character elements are ignored.
-        val attributesOnly = st.fields.forall { f =>
-          f.name == options.valueTag || 
f.name.startsWith(options.attributePrefix)
-        }
-        if (attributesOnly) {
-          // If everything else is an attribute column, there's no complex 
structure.
-          // Just return the value of the character element, or null if we 
don't have a value tag
-          st.find(_.name == options.valueTag).map(
-            valueTag => convertTo(c.getData, valueTag.dataType)).orNull
-        } else {
-          // Otherwise, ignore this character element, and continue parsing 
the following complex
-          // structure
-          parser.next
-          parser.peek match {
-            case _: EndElement => null // no struct here at all; done
-            case _ => convertObject(parser, st)
-          }
+        parser.next
+        parser.peek match {
+          case _: EndElement =>
+            // It couldn't be an array of value tags
+            // as the opening tag is immediately followed by a closing tag.
+            if (c.isWhiteSpace) {
+              return null
+            }
+            val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
+            indexOpt match {
+              case Some(index) =>
+                convertTo(c.getData, st.fields(index).dataType)
+              case None => null
+            }
+          case _ =>
+            val row = convertObject(parser, st)
+            if (!c.isWhiteSpace) {

Review Comment:
   We need to document the behavior of whitespaces for valueTag. Also, the 
following scenarios, which contain whitespaces with quotes:
   ```
   <ROW><a>" "</a></ROW>
   <ROW><b>" "<c>1</c></b></ROW>
   <ROW><d><e attr=" "></e></d></ROW>
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -201,27 +217,30 @@ class StaxXmlParser(
       case (_: EndElement, _: DataType) => null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values 
are merged later.
+        parser.next
         convertTo(c.getData, st)
       case (c: Characters, st: StructType) =>
-        // If a value tag is present, this can be an attribute-only element 
whose values is in that
-        // value tag field. Or, it can be a mixed-type element with both some 
character elements
-        // and other complex structure. Character elements are ignored.
-        val attributesOnly = st.fields.forall { f =>
-          f.name == options.valueTag || 
f.name.startsWith(options.attributePrefix)
-        }
-        if (attributesOnly) {
-          // If everything else is an attribute column, there's no complex 
structure.
-          // Just return the value of the character element, or null if we 
don't have a value tag
-          st.find(_.name == options.valueTag).map(
-            valueTag => convertTo(c.getData, valueTag.dataType)).orNull
-        } else {
-          // Otherwise, ignore this character element, and continue parsing 
the following complex
-          // structure
-          parser.next
-          parser.peek match {
-            case _: EndElement => null // no struct here at all; done
-            case _ => convertObject(parser, st)
-          }
+        parser.next
+        parser.peek match {
+          case _: EndElement =>
+            // It couldn't be an array of value tags
+            // as the opening tag is immediately followed by a closing tag.
+            if (c.isWhiteSpace) {
+              return null
+            }
+            val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
+            indexOpt match {
+              case Some(index) =>
+                convertTo(c.getData, st.fields(index).dataType)
+              case None => null
+            }
+          case _ =>
+            val row = convertObject(parser, st)

Review Comment:
   Will this handle values separated by comment or cdata? If so, we don't need 
`case _: EndElement` above.
   ```
   <ROW>
     <a> 1 <!--this is a comment--> 2 </a>
   </ROW>
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -534,4 +541,55 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: 
Boolean)
       }
     }
   }
+
+  /**
+   * This helper function merges the data type of value tags and inner 
elements.
+   * It could only be structure data. Consider the following case,
+   * <a>
+   *   value1
+   *   <b>1</b>
+   *   value2
+   * </a>
+   * Input: ''a struct<b int, _VALUE string>'' and ''_VALUE string''
+   * Return: ''a struct<b int, _VALUE array<string>>''
+   * @param objectType inner elements' type
+   * @param valueTagType value tag's type
+   */
+  private[xml] def addOrUpdateValueTagType(
+      objectType: DataType,
+      valueTagType: DataType): DataType = {
+    (objectType, valueTagType) match {
+      case (st: StructType, _) =>
+        // TODO(shujing): case sensitive?

Review Comment:
   while the case for valueTag is unlikely to change, its better to add case 
sensitivity logic to it to make it consistent with other fields. Can be a 
separate PR. Not a high prio.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -534,4 +541,55 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: 
Boolean)
       }
     }
   }
+
+  /**
+   * This helper function merges the data type of value tags and inner 
elements.
+   * It could only be structure data. Consider the following case,
+   * <a>
+   *   value1
+   *   <b>1</b>
+   *   value2
+   * </a>
+   * Input: ''a struct<b int, _VALUE string>'' and ''_VALUE string''
+   * Return: ''a struct<b int, _VALUE array<string>>''
+   * @param objectType inner elements' type
+   * @param valueTagType value tag's type
+   */
+  private[xml] def addOrUpdateValueTagType(
+      objectType: DataType,
+      valueTagType: DataType): DataType = {
+    (objectType, valueTagType) match {
+      case (st: StructType, _) =>
+        // TODO(shujing): case sensitive?
+        val valueTagIndexOpt = st.getFieldIndex(options.valueTag)
+
+        valueTagIndexOpt match {
+          // If the field name exists in the inner elements,
+          // merge the type and infer the combined field as an array type if 
necessary
+          case Some(index) if !st(index).dataType.isInstanceOf[ArrayType] =>
+            updateStructField(
+              st,
+              index,
+              ArrayType(compatibleType(st(index).dataType, valueTagType)))
+          case Some(index) =>
+            updateStructField(st, index, compatibleType(st(index).dataType, 
valueTagType))

Review Comment:
   Won't `st(index).dataType` will be of `ArrayType`?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -534,4 +541,55 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: 
Boolean)
       }
     }
   }
+
+  /**
+   * This helper function merges the data type of value tags and inner 
elements.
+   * It could only be structure data. Consider the following case,
+   * <a>
+   *   value1
+   *   <b>1</b>
+   *   value2
+   * </a>
+   * Input: ''a struct<b int, _VALUE string>'' and ''_VALUE string''
+   * Return: ''a struct<b int, _VALUE array<string>>''
+   * @param objectType inner elements' type
+   * @param valueTagType value tag's type
+   */
+  private[xml] def addOrUpdateValueTagType(
+      objectType: DataType,
+      valueTagType: DataType): DataType = {
+    (objectType, valueTagType) match {
+      case (st: StructType, _) =>
+        // TODO(shujing): case sensitive?
+        val valueTagIndexOpt = st.getFieldIndex(options.valueTag)
+
+        valueTagIndexOpt match {
+          // If the field name exists in the inner elements,
+          // merge the type and infer the combined field as an array type if 
necessary
+          case Some(index) if !st(index).dataType.isInstanceOf[ArrayType] =>
+            updateStructField(
+              st,
+              index,
+              ArrayType(compatibleType(st(index).dataType, valueTagType)))
+          case Some(index) =>
+            updateStructField(st, index, compatibleType(st(index).dataType, 
valueTagType))

Review Comment:
   Lets add this test case scenario where `Array<LongType>` is updated to 
`Array<DoubleType>`:
   ```
   <ROW>
     <a>
       1
       <b>2</b>
       3
       <b>4</b>
       5.0
     </a>
   </ROW>
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -201,27 +217,30 @@ class StaxXmlParser(
       case (_: EndElement, _: DataType) => null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values 
are merged later.
+        parser.next
         convertTo(c.getData, st)
       case (c: Characters, st: StructType) =>
-        // If a value tag is present, this can be an attribute-only element 
whose values is in that
-        // value tag field. Or, it can be a mixed-type element with both some 
character elements
-        // and other complex structure. Character elements are ignored.
-        val attributesOnly = st.fields.forall { f =>
-          f.name == options.valueTag || 
f.name.startsWith(options.attributePrefix)
-        }
-        if (attributesOnly) {
-          // If everything else is an attribute column, there's no complex 
structure.
-          // Just return the value of the character element, or null if we 
don't have a value tag
-          st.find(_.name == options.valueTag).map(
-            valueTag => convertTo(c.getData, valueTag.dataType)).orNull
-        } else {
-          // Otherwise, ignore this character element, and continue parsing 
the following complex
-          // structure
-          parser.next
-          parser.peek match {
-            case _: EndElement => null // no struct here at all; done
-            case _ => convertObject(parser, st)
-          }
+        parser.next
+        parser.peek match {
+          case _: EndElement =>
+            // It couldn't be an array of value tags
+            // as the opening tag is immediately followed by a closing tag.
+            if (c.isWhiteSpace) {
+              return null
+            }
+            val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
+            indexOpt match {
+              case Some(index) =>
+                convertTo(c.getData, st.fields(index).dataType)
+              case None => null
+            }
+          case _ =>
+            val row = convertObject(parser, st)
+            if (!c.isWhiteSpace) {
+              addOrUpdate(row.toSeq(st).toArray, st, options.valueTag, 
c.getData, addToTail = false)

Review Comment:
   Why `addToTail` is false here?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -194,27 +210,30 @@ class StaxXmlParser(
       case (_: EndElement, _: DataType) => null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values 
are merged later.
+        parser.next
         convertTo(c.getData, st)
       case (c: Characters, st: StructType) =>
-        // If a value tag is present, this can be an attribute-only element 
whose values is in that
-        // value tag field. Or, it can be a mixed-type element with both some 
character elements
-        // and other complex structure. Character elements are ignored.
-        val attributesOnly = st.fields.forall { f =>
-          f.name == options.valueTag || 
f.name.startsWith(options.attributePrefix)
-        }
-        if (attributesOnly) {
-          // If everything else is an attribute column, there's no complex 
structure.
-          // Just return the value of the character element, or null if we 
don't have a value tag
-          st.find(_.name == options.valueTag).map(
-            valueTag => convertTo(c.getData, valueTag.dataType)).orNull
-        } else {
-          // Otherwise, ignore this character element, and continue parsing 
the following complex
-          // structure
-          parser.next
-          parser.peek match {
-            case _: EndElement => null // no struct here at all; done
-            case _ => convertObject(parser, st)
-          }
+        parser.next
+        parser.peek match {
+          case _: EndElement =>
+            // It couldn't be an array of value tags
+            // as the opening tag is immediately followed by a closing tag.
+            if (isEmptyString(c)) {
+              return null
+            }
+            val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
+            indexOpt match {
+              case Some(index) =>
+                convertTo(c.getData, st.fields(index).dataType)

Review Comment:
   yes, I get that. It looks like the assumption is that `convertField` will 
either return a `Row` or a singleton valueTag with just value.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Re: [PR] [SPARK-46382][SQL] XML: Capture values interspersed between elements [spark]

Reply via email to