This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new bc378f4ff5e2 [SPARK-47330][SQL][TESTS] XML: Added XmlExpressionsSuite bc378f4ff5e2 is described below commit bc378f4ff5e2dd87990c9354bd177c0bdc45e96b Author: Yousof Hosny <yousof.ho...@databricks.com> AuthorDate: Wed Mar 20 11:17:17 2024 +0900 [SPARK-47330][SQL][TESTS] XML: Added XmlExpressionsSuite ### What changes were proposed in this pull request? Replicated JsonExpressionsSuite to XmlExpressionsSuite for additional XML unit test coverage. ### Why are the changes needed? Improve XML Expressions unit testing coverage. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45445 from yhosny/xml-expressions-suite. Authored-by: Yousof Hosny <yousof.ho...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../catalyst/expressions/XmlExpressionsSuite.scala | 445 +++++++++++++++++++++ 1 file changed, 445 insertions(+) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/XmlExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/XmlExpressionsSuite.scala new file mode 100644 index 000000000000..9a1098515304 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/XmlExpressionsSuite.scala @@ -0,0 +1,445 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import java.text.{DecimalFormat, DecimalFormatSymbols, SimpleDateFormat} +import java.util.{Calendar, Locale, TimeZone} + +import org.scalatest.exceptions.TestFailedException + +import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.plans.PlanTestBase +import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{PST, UTC, UTC_OPT} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +class XmlExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with PlanTestBase { + test("from_xml escaping") { + val schema = StructType(StructField("\"quote", IntegerType) :: Nil) + GenerateUnsafeProjection.generate( + XmlToStructs(schema, Map.empty, Literal("\"quote"), UTC_OPT) :: Nil) + } + + test("from_xml") { + val xmlData = """<row><a>1</a></row>""" + val schema = StructType(StructField("a", IntegerType) :: Nil) + checkEvaluation( + XmlToStructs(schema, Map.empty, Literal(xmlData), UTC_OPT), + InternalRow(1) + ) + } + + test("from_xml- invalid data") { + val xmlData = """<row><a>1</row>""" + val schema = StructType(StructField("a", IntegerType) :: Nil) + checkEvaluation( + XmlToStructs(schema, Map.empty, Literal(xmlData), UTC_OPT), + InternalRow(null) + ) + + val exception = 
intercept[TestFailedException] { + checkEvaluation( + XmlToStructs(schema, Map("mode" -> FailFastMode.name), Literal(xmlData), UTC_OPT), + InternalRow(null) + ) + }.getCause + checkError( + exception = exception.asInstanceOf[SparkException], + errorClass = "MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION", + parameters = Map("badRecord" -> "[null]", "failFastMode" -> "FAILFAST") + ) + } + + test("from_xml - input=array, schema=array, output=row of array") { + val input = s""" + |<row> + | <a>1</a> + | <a>2</a> + |</row>""".stripMargin + val schema = StructType(StructField("a", ArrayType(IntegerType)) :: Nil) + val output = Row(Array(1, 2)) + checkEvaluation(XmlToStructs(schema, Map.empty, Literal(input), UTC_OPT), output) + } + + test("from_xml - input=single element, schema=array, output=array of single element") { + val input = s""" + |<row> + | <a>1</a> + |</row>""".stripMargin + val schema = StructType(StructField("a", ArrayType(IntegerType)) :: Nil) + val output = Row(Array(1)) + checkEvaluation(XmlToStructs(schema, Map.empty, Literal(input), UTC_OPT), output) + } + + test("from_xml - input=empty row, schema=array, output=empty") { + val input = "<row> </row>" + val schema = StructType(StructField("a", ArrayType(IntegerType)) :: Nil) + val output = Row(null) + checkEvaluation(XmlToStructs(schema, Map.empty, Literal(input), UTC_OPT), output) + } + + test("from_xml - input=empty, schema=array, output=empty") { + val input = """<?xml version="1.0"?>""" + val schema = StructType(StructField("a", ArrayType(IntegerType)) :: Nil) + val output = Row(null) + checkEvaluation(XmlToStructs(schema, Map.empty, Literal(input), UTC_OPT), output) + } + + test("from_xml - input=array, schema=struct, output=single row") { + val input = s""" + |<row> + | <a>1</a> + | <a>2</a> + |</row>""".stripMargin + val corrupted = "corrupted" + val schema = new StructType().add("a", IntegerType).add(corrupted, StringType) + val output = InternalRow(2, null) + val options = 
Map("columnNameOfCorruptRecord" -> corrupted) + checkEvaluation(XmlToStructs(schema, options, Literal(input), UTC_OPT), output) + } + + test("from_xml - input=empty array, schema=struct, output=single row with null") { + val input = s""" + |<row> + | <a></a> + | <a></a> + |</row>""".stripMargin + val schema = StructType(StructField("a", IntegerType) :: Nil) + val output = InternalRow(null) + checkEvaluation(XmlToStructs(schema, Map.empty, Literal(input), UTC_OPT), output) + } + + test("from_xml - input=empty object, schema=struct, output=single row with null") { + val input = s""" + |<row> + | <a></a> + | <a></a> + |</row>""".stripMargin + val schema = StructType(StructField("a", IntegerType) :: Nil) + val output = InternalRow(null) + checkEvaluation(XmlToStructs(schema, Map.empty, Literal(input), UTC_OPT), output) + } + + test("from_xml null input column") { + val schema = StructType(StructField("a", IntegerType) :: Nil) + checkEvaluation( + XmlToStructs(schema, Map.empty, Literal.create(null, StringType), UTC_OPT), + null + ) + } + + test("from_xml with timestamp") { + val schema = StructType(StructField("t", TimestampType) :: Nil) + val xmlData1 = s""" + |<row> + | <t>2016-01-01T00:00:00.123Z</t> + |</row>""".stripMargin + var c = Calendar.getInstance(TimeZone.getTimeZone(UTC)) + c.set(2016, 0, 1, 0, 0, 0) + c.set(Calendar.MILLISECOND, 123) + + + // The result doesn't change because the xml string includes timezone string ("Z" here), + // which means the string represents the timestamp string in the timezone regardless of + // the timeZoneId parameter. 
+ checkEvaluation( + XmlToStructs(schema, Map.empty, Literal.create(xmlData1, StringType), UTC_OPT), + InternalRow(c.getTimeInMillis * 1000L) + ) + + val xmlData2 = s""" + |<row> + | <t>2016-01-01T00:00:00</t> + |</row>""".stripMargin + for (zid <- DateTimeTestUtils.outstandingZoneIds) { + c = Calendar.getInstance(TimeZone.getTimeZone(zid)) + c.set(2016, 0, 1, 0, 0, 0) + c.set(Calendar.MILLISECOND, 0) + + checkEvaluation( + XmlToStructs( + schema, + Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss"), + Literal.create(xmlData2, StringType), + Option(zid.getId)), + InternalRow(c.getTimeInMillis * 1000L) + ) + checkEvaluation( + XmlToStructs( + schema, + Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", + DateTimeUtils.TIMEZONE_OPTION -> zid.getId), + Literal.create(xmlData2, StringType), + UTC_OPT), + InternalRow(c.getTimeInMillis * 1000L) + ) + } + } + + test("from_xml empty input column") { + val schema = StructType(StructField("a", IntegerType) :: Nil) + checkEvaluation( + XmlToStructs(schema, Map.empty, Literal.create(" ", StringType), UTC_OPT), + InternalRow(null) + ) + } + + test("to_xml escaping") { + val schema = StructType(StructField("\"quote", IntegerType) :: Nil) + val struct = Literal.create(create_row(1), schema) + GenerateUnsafeProjection.generate( + StructsToXml(Map.empty, struct, UTC_OPT) :: Nil) + } + + test("to_xml - struct") { + val schema = StructType(StructField("a", IntegerType) :: Nil) + val struct = Literal.create(create_row(1), schema) + checkEvaluation( + StructsToXml(Map.empty, struct, UTC_OPT), + s"""|<ROW> + | <a>1</a> + |</ROW>""".stripMargin + ) + } + + test("to_xml - array") { + val inputSchema = StructType(StructField("a", ArrayType(IntegerType)) :: Nil) + val input = Row(Array(1, 2)) + val output = s"""|<ROW> + | <a>1</a> + | <a>2</a> + |</ROW>""".stripMargin + checkEvaluation( + StructsToXml(Map.empty, Literal.create(input, inputSchema), UTC_OPT), + output) + } + + test("to_xml - row with empty array") { + val inputSchema = 
StructType(StructField("a", ArrayType(IntegerType)) :: Nil) + val input = Row(Array(null)) + val output = """<ROW/>""" + checkEvaluation( + StructsToXml(Map.empty, Literal.create(input, inputSchema), UTC_OPT), + output) + } + + test("to_xml - empty row") { + val inputSchema = StructType(StructField("a", ArrayType(IntegerType)) :: Nil) + val input = Row(null) + val output = """<ROW/>""" + checkEvaluation( + StructsToXml(Map.empty, Literal.create(input, inputSchema), UTC_OPT), + output) + } + + test("to_xml null input column") { + val schema = StructType(StructField("a", IntegerType) :: Nil) + val struct = Literal.create(null, schema) + checkEvaluation( + StructsToXml(Map.empty, struct, UTC_OPT), + null + ) + } + + test("to_xml with timestamp") { + val schema = StructType(StructField("t", TimestampType) :: Nil) + val c = Calendar.getInstance(TimeZone.getTimeZone(UTC)) + c.set(2016, 0, 1, 0, 0, 0) + c.set(Calendar.MILLISECOND, 0) + val struct = Literal.create(create_row(c.getTimeInMillis * 1000L), schema) + + checkEvaluation( + StructsToXml(Map.empty, struct, UTC_OPT), + s"""|<ROW> + | <t>2016-01-01T00:00:00.000Z</t> + |</ROW>""".stripMargin + ) + checkEvaluation( + StructsToXml(Map.empty, struct, Option(PST.getId)), + s"""|<ROW> + | <t>2015-12-31T16:00:00.000-08:00</t> + |</ROW>""".stripMargin + ) + + checkEvaluation( + StructsToXml( + Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", + DateTimeUtils.TIMEZONE_OPTION -> UTC_OPT.get), + struct, + UTC_OPT), + s"""|<ROW> + | <t>2016-01-01T00:00:00</t> + |</ROW>""".stripMargin + ) + checkEvaluation( + StructsToXml( + Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", + DateTimeUtils.TIMEZONE_OPTION -> PST.getId), + struct, + UTC_OPT), + s"""|<ROW> + | <t>2015-12-31T16:00:00</t> + |</ROW>""".stripMargin + ) + } + + test("to_xml - row with array of maps") { + val inputSchema = StructType( + StructField("f", ArrayType(MapType(StringType, IntegerType))) :: Nil) + val input = Row(Array(Map("a" -> 1), Map("a" -> 2))) + val 
output = s"""|<ROW> + | <f> + | <a>1</a> + | </f> + | <f> + | <a>2</a> + | </f> + |</ROW>""".stripMargin + checkEvaluation( + StructsToXml(Map.empty, Literal.create(input, inputSchema), UTC_OPT), + output) + } + + test("to_xml - row with single map") { + val inputSchema = StructType(StructField("f", MapType(StringType, IntegerType)) :: Nil) + val input = Row(Map("a" -> 1)) + val output = s"""|<ROW> + | <f> + | <a>1</a> + | </f> + |</ROW>""".stripMargin + checkEvaluation( + StructsToXml(Map.empty, Literal.create(input, inputSchema), UTC_OPT), + output) + } + + test("from_xml missing fields") { + val input = s""" + |<row> + | <a>1</a> + | <c>foo</c> + |</row>""".stripMargin + val xmlSchema = new StructType() + .add("a", LongType, nullable = false) + .add("b", StringType, nullable = false) + .add("c", StringType, nullable = false) + val output = InternalRow(1L, null, UTF8String.fromString("foo")) + val expr = XmlToStructs(xmlSchema, Map.empty, Literal.create(input, StringType), UTC_OPT) + checkEvaluation(expr, output) + val schema = expr.dataType + val schemaToCompare = xmlSchema.asNullable + assert(schemaToCompare == schema) + } + + test("infer schema of xml strings") { + checkEvaluation(new SchemaOfXml(Literal.create("""<ROW><col>0</col></ROW>""")), + "STRUCT<col: BIGINT>") + val input = s"""|<ROW> + | <col0>a</col0> + | <col0>b</col0> + | <col1> + | <col2>b</col2> + | </col1> + |</ROW>""".stripMargin + checkEvaluation( + new SchemaOfXml(Literal.create(input)), + "STRUCT<col0: ARRAY<STRING>, col1: STRUCT<col2: STRING>>") + } + + test("infer schema of Xml strings by using options") { + checkEvaluation( + new SchemaOfXml(Literal.create("""<ROW><col>01</col></ROW>"""), + CreateMap(Seq(Literal.create("allowNumericLeadingZeros"), Literal.create("true")))), + "STRUCT<col: BIGINT>") + } + + test("parse date with locale") { + Seq("en-US", "ru-RU").foreach { langTag => + val locale = Locale.forLanguageTag(langTag) + val date = new 
SimpleDateFormat("yyyy-MM-dd").parse("2018-11-05") + val schema = new StructType().add("d", DateType) + val dateFormat = "MMM yyyy" + val sdf = new SimpleDateFormat(dateFormat, locale) + val dateStr = s"""|<ROW> + | <d>${sdf.format(date)}</d> + |</ROW>""".stripMargin + val options = Map("dateFormat" -> dateFormat, "locale" -> langTag) + + checkEvaluation( + XmlToStructs(schema, options, Literal.create(dateStr), UTC_OPT), + InternalRow(17836)) // number of days from 1970-01-01 + } + } + + test("verify corrupt column") { + checkExceptionInExpression[AnalysisException]( + XmlToStructs( + schema = StructType.fromDDL("i int, _unparsed boolean"), + options = Map("columnNameOfCorruptRecord" -> "_unparsed"), + child = Literal.create("""{"i":"a"}"""), + timeZoneId = UTC_OPT), + expectedErrMsg = "The field for corrupt records must be string type and nullable") + } + + def decimalInput(langTag: String): (Decimal, String) = { + val decimalVal = new java.math.BigDecimal("1000.001") + val decimalType = new DecimalType(10, 5) + val expected = Decimal(decimalVal, decimalType.precision, decimalType.scale) + val decimalFormat = new DecimalFormat("", + new DecimalFormatSymbols(Locale.forLanguageTag(langTag))) + val input = s"""|<ROW> + | <d>${decimalFormat.format(expected.toBigDecimal)}</d> + |</ROW>""".stripMargin + + (expected, input) + } + + test("parse decimals using locale") { + def checkDecimalParsing(langTag: String): Unit = { + val schema = new StructType().add("d", DecimalType(10, 5)) + val options = Map("locale" -> langTag) + val (expected, input) = decimalInput(langTag) + + checkEvaluation( + XmlToStructs(schema, options, Literal.create(input), UTC_OPT), + InternalRow(expected)) + } + + Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach(checkDecimalParsing) + } + + test("inferring the decimal type using locale") { + def checkDecimalInfer(langTag: String, expectedType: String): Unit = { + val options = Map("locale" -> langTag, "prefersDecimal" -> "true") + val (_, input) = 
decimalInput(langTag) + + checkEvaluation( + SchemaOfXml(Literal.create(input), options), + expectedType) + } + + Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach { + checkDecimalInfer(_, """STRUCT<d: DECIMAL(7,3)>""") + } + } + +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org