This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push: new a936522 [SPARK-29758][SQL][2.4] Fix truncation of requested string fields in `json_tuple` a936522 is described below commit a9365221133caadffbbbbce1aae1ace799a588a3 Author: Maxim Gekk <max.g...@gmail.com> AuthorDate: Wed Nov 20 15:32:28 2019 +0800 [SPARK-29758][SQL][2.4] Fix truncation of requested string fields in `json_tuple` ### What changes were proposed in this pull request? In this PR, I propose to remove an optimization in `json_tuple` which causes truncation of results for large requested string fields. ### Why are the changes needed? Spark 2.4 uses Jackson Core 2.6.7, which has a bug in copying strings. This bug may lead to truncation of results in some cases. The bug has already been fixed by the commit https://github.com/FasterXML/jackson-core/commit/554f8db0f940b2a53f974852a2af194739d65200 which has been part of Jackson Core since version 2.7.7. Upgrading Jackson Core to 2.7.7 or a later version is risky. That's why this PR proposes to avoid using the buggy methods of Jackson Core 2.6.7. ### Does this PR introduce any user-facing change? No ### How was this patch tested? By a new test added to `JsonFunctionsSuite` Closes #26563 from MaxGekk/fix-truncation-by-json_tuple-2.4. 
Authored-by: Maxim Gekk <max.g...@gmail.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/sql/catalyst/expressions/jsonExpressions.scala | 5 ----- .../test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala | 10 ++++++++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 6650e45..4cd1a091 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -472,11 +472,6 @@ case class JsonTuple(children: Seq[Expression]) parser.getCurrentToken match { // if the user requests a string field it needs to be returned without enclosing // quotes which is accomplished via JsonGenerator.writeRaw instead of JsonGenerator.write - case JsonToken.VALUE_STRING if parser.hasTextCharacters => - // slight optimization to avoid allocating a String instance, though the characters - // still have to be decoded... 
Jackson doesn't have a way to access the raw bytes - generator.writeRaw(parser.getTextCharacters, parser.getTextOffset, parser.getTextLength) - case JsonToken.VALUE_STRING => // the normal String case, pass it through to the output without enclosing quotes generator.writeRaw(parser.getText) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index b1f7446..18335ef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -535,4 +535,14 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext { to_json(struct($"t"), Map("timestampFormat" -> "yyyy-MM-dd HH:mm:ss.SSSSSS"))) checkAnswer(df, Row(s"""{"t":"$s"}""")) } + + test("json_tuple - do not truncate results") { + val len = 2800 + val str = Array.tabulate(len)(_ => "a").mkString + val json_tuple_result = Seq(s"""{"test":"$str"}""").toDF("json") + .withColumn("result", json_tuple('json, "test")) + .select('result) + .as[String].head.length + assert(json_tuple_result === len) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org