[spark] branch master updated: [SPARK-42881][SQL] Codegen Support for get_json_object

maxgekk Wed, 11 Oct 2023 07:43:08 -0700

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new c2525308330 [SPARK-42881][SQL] Codegen Support for get_json_object
c2525308330 is described below

commit c252530833097759b1f943ff89b05f22025f0dd0
Author: panbingkun <pbk1...@gmail.com>
AuthorDate: Wed Oct 11 17:42:48 2023 +0300

    [SPARK-42881][SQL] Codegen Support for get_json_object
    
    ### What changes were proposed in this pull request?
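    This PR adds code generation (codegen) support for `get_json_object`: the expression no longer mixes in `CodegenFallback` and instead implements `doGenCode`, delegating evaluation to a new `GetJsonObjectEvaluator` class.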
    
    ### Why are the changes needed?
    Improve codegen coverage and performance.
    GitHub benchmark data (https://github.com/panbingkun/spark/actions/runs/4497396473/jobs/7912952710):
    <img width="879" alt="image" src="https://user-images.githubusercontent.com/15246973/227117793-bab38c42-dcc1-46de-a689-25a87b8f3561.png">
    
    Local benchmark data:
    <img width="895" alt="image" src="https://user-images.githubusercontent.com/15246973/227098745-9b360e60-fe84-4419-8b7d-073a0530816a.png">
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Added new unit tests (UT) to `JsonFunctionsSuite`.
    Passed GitHub Actions (GA).
    
    Closes #40506 from panbingkun/json_code_gen.
    
    Authored-by: panbingkun <pbk1...@gmail.com>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 .../sql/catalyst/expressions/jsonExpressions.scala | 121 +++++++++++++++++---
 sql/core/benchmarks/JsonBenchmark-results.txt      | 127 +++++++++++----------
 .../org/apache/spark/sql/JsonFunctionsSuite.scala  |  28 +++++
 .../execution/datasources/json/JsonBenchmark.scala |  15 ++-
 4 files changed, 208 insertions(+), 83 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index e7df542ddab..04bc457b66a 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -28,7 +28,8 @@ import com.fasterxml.jackson.core.json.JsonReadFeature
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
-import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, 
CodeGenerator, CodegenFallback, ExprCode}
+import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper
 import org.apache.spark.sql.catalyst.json._
 import org.apache.spark.sql.catalyst.trees.TreePattern.{JSON_TO_STRUCT, 
TreePattern}
 import org.apache.spark.sql.catalyst.util._
@@ -125,13 +126,7 @@ private[this] object SharedFactory {
   group = "json_funcs",
   since = "1.5.0")
 case class GetJsonObject(json: Expression, path: Expression)
-  extends BinaryExpression with ExpectsInputTypes with CodegenFallback {
-
-  import com.fasterxml.jackson.core.JsonToken._
-
-  import PathInstruction._
-  import SharedFactory._
-  import WriteStyle._
+  extends BinaryExpression with ExpectsInputTypes {
 
   override def left: Expression = json
   override def right: Expression = path
@@ -140,18 +135,114 @@ case class GetJsonObject(json: Expression, path: 
Expression)
   override def nullable: Boolean = true
   override def prettyName: String = "get_json_object"
 
-  @transient private lazy val parsedPath = 
parsePath(path.eval().asInstanceOf[UTF8String])
+  @transient
+  private lazy val evaluator = if (path.foldable) {
+    new GetJsonObjectEvaluator(path.eval().asInstanceOf[UTF8String])
+  } else {
+    new GetJsonObjectEvaluator()
+  }
 
   override def eval(input: InternalRow): Any = {
-    val jsonStr = json.eval(input).asInstanceOf[UTF8String]
+    evaluator.setJson(json.eval(input).asInstanceOf[UTF8String])
+    if (!path.foldable) {
+      evaluator.setPath(path.eval(input).asInstanceOf[UTF8String])
+    }
+    evaluator.evaluate()
+  }
+
+  protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    val evaluatorClass = classOf[GetJsonObjectEvaluator].getName
+    val initEvaluator = path.foldable match {
+      case true if path.eval() != null =>
+        val cachedPath = path.eval().asInstanceOf[UTF8String]
+        val refCachedPath = ctx.addReferenceObj("cachedPath", cachedPath)
+        s"new $evaluatorClass($refCachedPath)"
+      case _ => s"new $evaluatorClass()"
+    }
+    val evaluator = ctx.addMutableState(evaluatorClass, "evaluator",
+      v => s"""$v = $initEvaluator;""", forceInline = true)
+
+    val jsonEval = json.genCode(ctx)
+    val pathEval = path.genCode(ctx)
+
+    val setJson =
+      s"""
+         |if (${jsonEval.isNull}) {
+         |  $evaluator.setJson(null);
+         |} else {
+         |  $evaluator.setJson(${jsonEval.value});
+         |}
+         |""".stripMargin
+    val setPath = if (!path.foldable) {
+      s"""
+         |if (${pathEval.isNull}) {
+         |  $evaluator.setPath(null);
+         |} else {
+         |  $evaluator.setPath(${pathEval.value});
+         |}
+         |""".stripMargin
+    } else {
+      ""
+    }
+
+    val resultType = CodeGenerator.boxedType(dataType)
+    val resultTerm = ctx.freshName("result")
+    ev.copy(code =
+      code"""
+         |${jsonEval.code}
+         |${pathEval.code}
+         |$setJson
+         |$setPath
+         |$resultType $resultTerm = ($resultType) $evaluator.evaluate();
+         |boolean ${ev.isNull} = $resultTerm == null;
+         |${CodeGenerator.javaType(dataType)} ${ev.value} = 
${CodeGenerator.defaultValue(dataType)};
+         |if (!${ev.isNull}) {
+         |  ${ev.value} = $resultTerm;
+         |}
+         |""".stripMargin
+    )
+  }
+
+  override protected def withNewChildrenInternal(
+      newLeft: Expression, newRight: Expression): GetJsonObject =
+    copy(json = newLeft, path = newRight)
+}
+
+class GetJsonObjectEvaluator(cachedPath: UTF8String) {
+  import com.fasterxml.jackson.core.JsonToken._
+  import PathInstruction._
+  import SharedFactory._
+  import WriteStyle._
+
+  def this() = this(null)
+
+  @transient
+  private lazy val parsedPath: Option[List[PathInstruction]] =
+    parsePath(cachedPath)
+
+  @transient
+  private var jsonStr: UTF8String = null
+
+  @transient
+  private var pathStr: UTF8String = null
+
+  def setJson(arg: UTF8String): Unit = {
+    jsonStr = arg
+  }
+
+  def setPath(arg: UTF8String): Unit = {
+    pathStr = arg
+  }
+
+  def evaluate(): Any = {
     if (jsonStr == null) {
       return null
     }
 
-    val parsed = if (path.foldable) {
+    val parsed = if (cachedPath != null) {
       parsedPath
     } else {
-      parsePath(path.eval(input).asInstanceOf[UTF8String])
+      parsePath(pathStr)
     }
 
     if (parsed.isDefined) {
@@ -294,7 +385,7 @@ case class GetJsonObject(json: Expression, path: Expression)
           g.writeRawValue(buf.toString)
         } else if (dirty == 1) {
           // remove outer array tokens
-          g.writeRawValue(buf.substring(1, buf.length()-1))
+          g.writeRawValue(buf.substring(1, buf.length() - 1))
         } // else do not write anything
 
         dirty > 0
@@ -337,10 +428,6 @@ case class GetJsonObject(json: Expression, path: 
Expression)
         false
     }
   }
-
-  override protected def withNewChildrenInternal(
-      newLeft: Expression, newRight: Expression): GetJsonObject =
-    copy(json = newLeft, path = newRight)
 }
 
 // scalastyle:off line.size.limit line.contains.tab
diff --git a/sql/core/benchmarks/JsonBenchmark-results.txt 
b/sql/core/benchmarks/JsonBenchmark-results.txt
index 035e0165ffd..ae4a9ae0c79 100644
--- a/sql/core/benchmarks/JsonBenchmark-results.txt
+++ b/sql/core/benchmarks/JsonBenchmark-results.txt
@@ -3,127 +3,128 @@ Benchmark for performance of JSON parsing
 
================================================================================================
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 JSON schema inferring:                    Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-No encoding                                        2858           2897         
 62          1.7         571.7       1.0X
-UTF-8 is set                                       4281           4291         
  9          1.2         856.1       0.7X
+No encoding                                        2929           3010         
 86          1.7         585.9       1.0X
+UTF-8 is set                                       4313           4344         
 41          1.2         862.5       0.7X
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 count a short column:                     Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-No encoding                                        3070           3076         
  5          1.6         614.1       1.0X
-UTF-8 is set                                       4641           4666         
 22          1.1         928.2       0.7X
+No encoding                                        2797           2857         
 68          1.8         559.4       1.0X
+UTF-8 is set                                       4262           4281         
 17          1.2         852.4       0.7X
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 count a wide column:                      Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-No encoding                                        4258           4424         
282          0.2        4258.4       1.0X
-UTF-8 is set                                       6180           6194         
 18          0.2        6180.0       0.7X
+No encoding                                        4265           4360         
 88          0.2        4265.4       1.0X
+UTF-8 is set                                       6400           6434         
 29          0.2        6400.4       0.7X
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 select wide row:                          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-No encoding                                       12765          12772         
 11          0.0      255294.1       1.0X
-UTF-8 is set                                      14144          14209         
 78          0.0      282874.0       0.9X
+No encoding                                       12301          12381         
113          0.0      246024.1       1.0X
+UTF-8 is set                                      13846          13912         
 57          0.0      276925.6       0.9X
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Select a subset of 10 columns:            Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Select 10 columns                                  2352           2372         
 25          0.4        2352.3       1.0X
-Select 1 column                                    1683           1705         
 28          0.6        1682.6       1.4X
+Select 10 columns                                  2316           2323         
  7          0.4        2316.3       1.0X
+Select 1 column                                    1702           1717         
 17          0.6        1702.0       1.4X
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 creation of JSON parser per line:         Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Short column without encoding                       873            890         
 22          1.1         873.3       1.0X
-Short column with UTF-8                            1169           1177         
 14          0.9        1168.5       0.7X
-Wide column without encoding                       7404           8401        
1445          0.1        7404.1       0.1X
-Wide column with UTF-8                             9207           9222         
 16          0.1        9207.2       0.1X
+Short column without encoding                       827            850         
 22          1.2         827.1       1.0X
+Short column with UTF-8                            1111           1116         
  7          0.9        1111.0       0.7X
+Wide column without encoding                       7409           7447         
 50          0.1        7409.2       0.1X
+Wide column with UTF-8                            10580          10616         
 34          0.1       10580.4       0.1X
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 JSON functions:                           Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Text read                                            81             93         
 11         12.4          80.8       1.0X
-from_json                                          1824           1866         
 45          0.5        1823.7       0.0X
-json_tuple                                         1716           1737         
 23          0.6        1716.2       0.0X
-get_json_object                                    1623           1637         
 22          0.6        1622.6       0.0X
+Text read                                            88             92         
  6         11.3          88.3       1.0X
+from_json                                          2083           2091         
  7          0.5        2083.1       0.0X
+json_tuple                                         2101           2133         
 42          0.5        2101.4       0.0X
+get_json_object wholestage off                     2032           2037         
  8          0.5        2032.0       0.0X
+get_json_object wholestage on                      1917           1926         
 10          0.5        1917.3       0.0X
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Dataset of json strings:                  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Text read                                           342            344         
  2         14.6          68.3       1.0X
-schema inferring                                   2213           2218         
  5          2.3         442.6       0.2X
-parsing                                            3734           3740         
  5          1.3         746.9       0.1X
+Text read                                           351            351         
  0         14.3          70.1       1.0X
+schema inferring                                   2342           2344         
  4          2.1         468.3       0.1X
+parsing                                            3728           3751         
 26          1.3         745.6       0.1X
 
 Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Json files in the per-line mode:          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Text read                                           917            923         
  6          5.4         183.5       1.0X
-Schema inferring                                   2952           2956         
  4          1.7         590.4       0.3X
-Parsing without charset                            3979           3988         
 10          1.3         795.8       0.2X
-Parsing with UTF-8                                 5459           5464         
  6          0.9        1091.9       0.2X
+Text read                                           876            883         
  9          5.7         175.2       1.0X
+Schema inferring                                   3072           3082         
 14          1.6         614.4       0.3X
+Parsing without charset                            3870           3877         
  7          1.3         774.1       0.2X
+Parsing with UTF-8                                 5287           5290         
  5          0.9        1057.3       0.2X
 
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Write dates and timestamps:               Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps                      170            171         
  2          5.9         169.6       1.0X
-to_json(timestamp)                                 1033           1036         
  4          1.0        1032.6       0.2X
-write timestamps to files                           925            934         
  8          1.1         924.9       0.2X
-Create a dataset of dates                           171            177         
  6          5.8         171.5       1.0X
-to_json(date)                                       741            743         
  5          1.4         740.7       0.2X
-write dates to files                                616            624         
 11          1.6         616.3       0.3X
+Create a dataset of timestamps                      193            200         
 10          5.2         192.5       1.0X
+to_json(timestamp)                                 1034           1044         
 14          1.0        1033.6       0.2X
+write timestamps to files                           945            966         
 26          1.1         945.0       0.2X
+Create a dataset of dates                           200            205         
  6          5.0         199.8       1.0X
+to_json(date)                                       757            763         
  6          1.3         757.0       0.3X
+write dates to files                                647            660         
 20          1.5         646.8       0.3X
 
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read dates and timestamps:                                             Best 
Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
-----------------------------------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files                                                 
  222            225           4          4.5         221.7       1.0X
-read timestamps from files                                                     
 2595           2634          46          0.4        2595.4       0.1X
-infer timestamps from files                                                    
 6351           6359           8          0.2        6350.7       0.0X
-read date text from files                                                      
  203            207           5          4.9         203.2       1.1X
-read date from files                                                           
  973            978           4          1.0         973.2       0.2X
-timestamp strings                                                              
  220            225           5          4.6         219.7       1.0X
-parse timestamps from Dataset[String]                                          
 2812           2815           3          0.4        2811.5       0.1X
-infer timestamps from Dataset[String]                                          
 6520           6523           4          0.2        6519.6       0.0X
-date strings                                                                   
  294            304           9          3.4         293.6       0.8X
-parse dates from Dataset[String]                                               
 1355           1359           6          0.7        1354.5       0.2X
-from_json(timestamp)                                                           
 3797           3800           2          0.3        3797.2       0.1X
-from_json(date)                                                                
 2267           2282          13          0.4        2266.8       0.1X
-infer error timestamps from Dataset[String] with default format                
 1863           1864           1          0.5        1862.5       0.1X
-infer error timestamps from Dataset[String] with user-provided format          
 1849           1855           6          0.5        1849.2       0.1X
-infer error timestamps from Dataset[String] with legacy format                 
 1832           1847          24          0.5        1831.7       0.1X
+read timestamp text from files                                                 
  227            231           4          4.4         227.3       1.0X
+read timestamps from files                                                     
 2670           2725          70          0.4        2670.2       0.1X
+infer timestamps from files                                                    
 6703           6714          17          0.1        6703.1       0.0X
+read date text from files                                                      
  201            205           5          5.0         200.8       1.1X
+read date from files                                                           
  944            951           7          1.1         944.0       0.2X
+timestamp strings                                                              
  219            224           6          4.6         218.9       1.0X
+parse timestamps from Dataset[String]                                          
 2847           2856           8          0.4        2847.3       0.1X
+infer timestamps from Dataset[String]                                          
 6725           6737          13          0.1        6724.9       0.0X
+date strings                                                                   
  300            304           4          3.3         299.6       0.8X
+parse dates from Dataset[String]                                               
 1230           1245          16          0.8        1230.5       0.2X
+from_json(timestamp)                                                           
 4123           4125           2          0.2        4123.0       0.1X
+from_json(date)                                                                
 2574           2585           9          0.4        2574.4       0.1X
+infer error timestamps from Dataset[String] with default format                
 1871           1878           8          0.5        1870.8       0.1X
+infer error timestamps from Dataset[String] with user-provided format          
 1869           1877          13          0.5        1868.9       0.1X
+infer error timestamps from Dataset[String] with legacy format                 
 1847           1875          43          0.5        1847.2       0.1X
 
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Filters pushdown:                         Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-w/o filters                                       18958          18970         
 10          0.0      189581.8       1.0X
-pushdown disabled                                 18640          18656         
 15          0.0      186401.4       1.0X
-w/ filters                                          874            881         
  6          0.1        8742.7      21.7X
+w/o filters                                       19347          19382         
 40          0.0      193474.6       1.0X
+pushdown disabled                                 19320          19329         
 11          0.0      193196.4       1.0X
+w/ filters                                          897            898         
  1          0.1        8968.3      21.6X
 
-OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1046-azure
+OpenJDK 64-Bit Server VM 17.0.8+7-LTS on Linux 5.15.0-1047-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Partial JSON results:                     Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-parse invalid JSON                                 3533           3693         
239          0.0      353318.7       1.0X
+parse invalid JSON                                 3398           3589         
249          0.0      339830.8       1.0X
 
 
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala
index b7b34129a95..51e66f40121 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala
@@ -29,6 +29,7 @@ import org.apache.spark.{SparkException, 
SparkRuntimeException}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Literal, StructsToJson}
 import org.apache.spark.sql.catalyst.expressions.Cast._
+import org.apache.spark.sql.execution.WholeStageCodegenExec
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
@@ -1397,4 +1398,31 @@ class JsonFunctionsSuite extends QueryTest with 
SharedSparkSession {
     checkAnswer(df.selectExpr("json_object_keys(a)"), expected)
     checkAnswer(df.select(json_object_keys($"a")), expected)
   }
+
+  test("function get_json_object - Codegen Support") {
+    withTempView("GetJsonObjectTable") {
+      val data = Seq(("1", """{"f1": "value1", "f5": 5.23}""")).toDF("key", 
"jstring")
+      data.createOrReplaceTempView("GetJsonObjectTable")
+      val df = sql("SELECT key, get_json_object(jstring, '$.f1') FROM 
GetJsonObjectTable")
+      val plan = df.queryExecution.executedPlan
+      assert(plan.isInstanceOf[WholeStageCodegenExec])
+      checkAnswer(df, Seq(Row("1", "value1")))
+    }
+  }
+
+  test("function get_json_object - path is null") {
+    val data = Seq(("""{"name": "alice", "age": 5}""", "")).toDF("a", "b")
+    val df = data.selectExpr("get_json_object(a, null)")
+    val plan = df.queryExecution.executedPlan
+    assert(plan.isInstanceOf[WholeStageCodegenExec])
+    checkAnswer(df, Row(null))
+  }
+
+  test("function get_json_object - json is null") {
+    val data = Seq(("""{"name": "alice", "age": 5}""", "")).toDF("a", "b")
+    val df = data.selectExpr("get_json_object(null, '$.name')")
+    val plan = df.queryExecution.executedPlan
+    assert(plan.isInstanceOf[WholeStageCodegenExec])
+    checkAnswer(df, Row(null))
+  }
 }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
index 5b86543648f..02ed2a16d11 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
@@ -272,9 +272,18 @@ object JsonBenchmark extends SqlBasedBenchmark {
       json_tuple_ds.noop()
     }
 
-    benchmark.addCase("get_json_object", iters) { _ =>
-      val get_json_object_ds = in.select(get_json_object($"value", "$.a"))
-      get_json_object_ds.noop()
+    benchmark.addCase("get_json_object wholestage off", iters) { _ =>
+      withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
+        val get_json_object_ds = in.select(get_json_object($"value", "$.a"))
+        get_json_object_ds.noop()
+      }
+    }
+
+    benchmark.addCase("get_json_object wholestage on", iters) { _ =>
+      withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") {
+        val get_json_object_ds = in.select(get_json_object($"value", "$.a"))
+        get_json_object_ds.noop()
+      }
     }
 
     benchmark.run()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-42881][SQL] Codegen Support for get_json_object

Reply via email to