This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 0985f13 [SPARK-31065][SQL] Match schema_of_json to the schema inference of JSON data source 0985f13 is described below commit 0985f13bc66a99319820d0d9ba5b3f2a254f61a4 Author: HyukjinKwon <gurwls...@apache.org> AuthorDate: Tue Mar 10 00:33:32 2020 -0700 [SPARK-31065][SQL] Match schema_of_json to the schema inference of JSON data source This PR proposes two things: 1. Convert `null` to `string` type during schema inference of `schema_of_json` as JSON datasource does. This is a bug fix as well because `null` is not a proper DDL-formatted type string, so the SQL parser is unable to recognise it as a type string. We should match it to JSON datasource and return a string type so `schema_of_json` returns a proper DDL-formatted string. 2. Let `schema_of_json` respect `dropFieldIfAllNull` option during schema inference. To let `schema_of_json` return a proper DDL-formatted string, and respect `dropFieldIfAllNull` option. Yes, it does. ```scala import collection.JavaConverters._ import org.apache.spark.sql.functions._ spark.range(1).select(schema_of_json(lit("""{"id": ""}"""))).show() spark.range(1).select(schema_of_json(lit("""{"id": "a", "drop": {"drop": null}}"""), Map("dropFieldIfAllNull" -> "true").asJava)).show(false) ``` **Before:** ``` struct<id:null> struct<drop:struct<drop:null>,id:string> ``` **After:** ``` struct<id:string> struct<id:string> ``` Manually tested, and unit tests were added. Closes #27854 from HyukjinKwon/SPARK-31065. 
Authored-by: HyukjinKwon <gurwls...@apache.org> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> (cherry picked from commit 815c7929c290d6eed86dc5c924f9f7d48cff179d) Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- .../sql/catalyst/expressions/jsonExpressions.scala | 13 +++++++- .../spark/sql/catalyst/json/JsonInferSchema.scala | 13 ++++---- .../org/apache/spark/sql/JsonFunctionsSuite.scala | 35 ++++++++++++++++++++++ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 61afdb6..a63e541 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -773,7 +773,18 @@ case class SchemaOfJson( override def eval(v: InternalRow): Any = { val dt = Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, json)) { parser => parser.nextToken() - jsonInferSchema.inferField(parser) + // To match with schema inference from JSON datasource. 
+ jsonInferSchema.inferField(parser) match { + case st: StructType => + jsonInferSchema.canonicalizeType(st, jsonOptions).getOrElse(StructType(Nil)) + case at: ArrayType if at.elementType.isInstanceOf[StructType] => + jsonInferSchema + .canonicalizeType(at.elementType, jsonOptions) + .map(ArrayType(_, containsNull = at.containsNull)) + .getOrElse(ArrayType(StructType(Nil), containsNull = at.containsNull)) + case other: DataType => + jsonInferSchema.canonicalizeType(other, jsonOptions).getOrElse(StringType) + } } UTF8String.fromString(dt.catalogString) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala index 82dd6d0..3dd8694 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala @@ -92,12 +92,10 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable { } json.sparkContext.runJob(mergedTypesFromPartitions, foldPartition, mergeResult) - canonicalizeType(rootType, options) match { - case Some(st: StructType) => st - case _ => - // canonicalizeType erases all empty structs, including the only one we want to keep - StructType(Nil) - } + canonicalizeType(rootType, options) + .find(_.isInstanceOf[StructType]) + // canonicalizeType erases all empty structs, including the only one we want to keep + .getOrElse(StructType(Nil)).asInstanceOf[StructType] } /** @@ -198,7 +196,8 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable { * Recursively canonicalizes inferred types, e.g., removes StructTypes with no fields, * drops NullTypes or converts them to StringType based on provided options. 
*/ - private def canonicalizeType(tpe: DataType, options: JSONOptions): Option[DataType] = tpe match { + private[catalyst] def canonicalizeType( + tpe: DataType, options: JSONOptions): Option[DataType] = tpe match { case at: ArrayType => canonicalizeType(at.elementType, options) .map(t => at.copy(elementType = t)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index fd1e9e3..8cc5c22 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -653,4 +653,39 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { assert(json_tuple_result === len) } } + + test("SPARK-31065: schema_of_json - null and empty strings as strings") { + Seq("""{"id": null}""", """{"id": ""}""").foreach { input => + checkAnswer( + spark.range(1).select(schema_of_json(input)), + Seq(Row("struct<id:string>"))) + } + } + + test("SPARK-31065: schema_of_json - 'dropFieldIfAllNull' option") { + val options = Map("dropFieldIfAllNull" -> "true") + // Structs + checkAnswer( + spark.range(1).select( + schema_of_json( + lit("""{"id": "a", "drop": {"drop": null}}"""), + options.asJava)), + Seq(Row("struct<id:string>"))) + + // Array of structs + checkAnswer( + spark.range(1).select( + schema_of_json( + lit("""[{"id": "a", "drop": {"drop": null}}]"""), + options.asJava)), + Seq(Row("array<struct<id:string>>"))) + + // Other types are not affected. + checkAnswer( + spark.range(1).select( + schema_of_json( + lit("""null"""), + options.asJava)), + Seq(Row("string"))) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org