[GitHub] [spark] maropu commented on a change in pull request #20965: [SPARK-21870][SQL] Split aggregation code into small functions

GitBox Wed, 28 Aug 2019 16:48:59 -0700

maropu commented on a change in pull request #20965: [SPARK-21870][SQL] Split 
aggregation code into small functions
URL: https://github.com/apache/spark/pull/20965#discussion_r318837668


 ##########
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
 ##########
 @@ -255,41 +260,148 @@ case class HashAggregateExec(
      """.stripMargin
   }
 
+  // Splits aggregate code into small functions because most JVM implementations
+  // cannot compile overly long functions.
+  //
+  // Note: The difference from `CodeGenerator.splitExpressions` is that we 
define an individual
+  // function for each aggregation function (e.g., SUM and AVG). For example, 
in a query
+  // `SELECT SUM(a), AVG(a) FROM VALUES(1) t(a)`, we define two functions
+  // for `SUM(a)` and `AVG(a)`.
+  private def splitAggregateExpressions(
+      ctx: CodegenContext,
+      aggNames: Seq[String],
+      aggExprs: Seq[Seq[Expression]],
+      makeSplitAggFunctions: => Seq[String],
+      subExprs: Map[Expression, SubExprEliminationState]): Option[String] = {
+    val inputVars = aggExprs.map { aggExprsInAgg =>
+      val inputVarsInAgg = aggExprsInAgg.map(
+        CodeGenerator.getLocalInputVariableValues(ctx, _, subExprs)).reduce(_ 
++ _).toSeq
+      val paramLength = 
CodeGenerator.calculateParamLengthFromExprValues(inputVarsInAgg)
+
+      // Checks that the parameter length for `aggExprsInAgg` does not exceed the JVM limit
+      if (CodeGenerator.isValidParamLength(paramLength)) {
+        Some(inputVarsInAgg)
+      } else {
+        None
+      }
+    }
+
+    // Checks if all the aggregate code can be split into pieces.
+    // If the parameter length of at least one `aggExprsInAgg` goes over the limit,
+    // we give up splitting the aggregate code entirely.
+    if (inputVars.forall(_.isDefined)) {
+      val splitAggEvalCodes = makeSplitAggFunctions
+      val splitCodes = inputVars.flatten.zipWithIndex.map { case (args, i) =>
+        val doAggVal = ctx.freshName(s"doAggregateVal_${aggNames(i)}")
+        val argList = args.map(v => s"${v.javaType.getName} 
${v.variableName}").mkString(", ")
+        val doAggValFuncName = ctx.addNewFunction(doAggVal,
+          s"""
+             | private void $doAggVal($argList) throws java.io.IOException {
+             |   ${splitAggEvalCodes(i)}
+             | }
+           """.stripMargin)
+
+        val inputVariables = args.map(_.variableName).mkString(", ")
+        s"$doAggValFuncName($inputVariables);"
+      }
+      Some(splitCodes.mkString("\n").trim)
+    } else {
+      val errMsg = "Failed to split aggregate code into small functions 
because the parameter " +
+        "length of at least one split function went over the JVM limit: " +
+        CodeGenerator.MAX_JVM_METHOD_PARAMS_LENGTH
+      if (Utils.isTesting) {
+        throw new IllegalStateException(errMsg)
+      } else {
+        logInfo(errMsg)
+        None
+      }
+    }
+  }
+
   private def doConsumeWithoutKeys(ctx: CodegenContext, input: Seq[ExprCode]): 
String = {
     // only have DeclarativeAggregate
     val functions = 
aggregateExpressions.map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate])
     val inputAttrs = functions.flatMap(_.aggBufferAttributes) ++ child.output
-    val updateExpr = aggregateExpressions.flatMap { e =>
+    val updateExprs = aggregateExpressions.map { e =>
       e.mode match {
         case Partial | Complete =>
           
e.aggregateFunction.asInstanceOf[DeclarativeAggregate].updateExpressions
         case PartialMerge | Final =>
           
e.aggregateFunction.asInstanceOf[DeclarativeAggregate].mergeExpressions
       }
     }
-    ctx.currentVars = bufVars ++ input
-    val boundUpdateExpr = bindReferences(updateExpr, inputAttrs)
-    val subExprs = 
ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExpr)
+    ctx.currentVars = bufVars.flatten ++ input
+    val boundUpdateExprs = updateExprs.map { updateExprsInAgg =>
+      updateExprsInAgg.map(BindReferences.bindReference(_, inputAttrs))
+    }
+    val subExprs = 
ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExprs.flatten)
     val effectiveCodes = subExprs.codes.mkString("\n")
-    val aggVals = ctx.withSubExprEliminationExprs(subExprs.states) {
-      boundUpdateExpr.map(_.genCode(ctx))
+    val aggVals = boundUpdateExprs.map { boundUpdateExprsInAgg =>
+      ctx.withSubExprEliminationExprs(subExprs.states) {
+        boundUpdateExprsInAgg.map(_.genCode(ctx))
+      }
     }
-    // aggregate buffer should be updated atomic
-    val updates = aggVals.zipWithIndex.map { case (ev, i) =>
+
+    lazy val nonSplitAggCode = {
+      // aggregate buffer should be updated atomically
+      val updates = aggVals.flatten.zip(bufVars.flatten).map { case (ev, 
bufVar) =>
+        s"""
+           | ${bufVar.isNull} = ${ev.isNull};
+           | ${bufVar.value} = ${ev.value};
+         """.stripMargin
+      }
       s"""
-         | ${bufVars(i).isNull} = ${ev.isNull};
-         | ${bufVars(i).value} = ${ev.value};
+         | // do aggregate
+         | // common sub-expressions
+         | $effectiveCodes
+         | // evaluate aggregate functions
+         | ${evaluateVariables(aggVals.flatten)}
+         | // update aggregation buffers
+         | ${updates.mkString("\n").trim}
        """.stripMargin
     }
-    s"""
-       | // do aggregate
-       | // common sub-expressions
-       | $effectiveCodes
-       | // evaluate aggregate function
-       | ${evaluateVariables(aggVals)}
-       | // update aggregation buffer
-       | ${updates.mkString("\n").trim}
-     """.stripMargin
+
+    if (conf.codegenSplitAggregateFunc) {
+      val splitAggCode = splitAggregateExpressions(
+        ctx = ctx,
+        aggNames = functions.map(_.prettyName),
+        aggExprs = boundUpdateExprs,
+        makeSplitAggFunctions = {
+          aggVals.zip(bufVars).map { case (aggValsInAgg, bufVarsInAgg) =>
+            // All the update code for aggregation buffers should be placed at the end
+            // of each aggregation function's code.
+            val updates = aggValsInAgg.zip(bufVarsInAgg).map { case (ev, 
bufVar) =>
+              s"""
+                 | ${bufVar.isNull} = ${ev.isNull};
 
 Review comment:
   removed

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] [spark] maropu commented on a change in pull request #20965: [SPARK-21870][SQL] Split aggregation code into small functions

Reply via email to