Github user maropu commented on a diff in the pull request: https://github.com/apache/spark/pull/18931#discussion_r136710928 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -149,14 +149,146 @@ trait CodegenSupport extends SparkPlan { ctx.freshNamePrefix = parent.variablePrefix val evaluated = evaluateRequiredVariables(output, inputVars, parent.usedInputs) + + // Under certain conditions, we can put the logic to consume the rows of this operator into + // another function. So we can prevent a generated function too long to be optimized by JIT. + // The conditions: + // 1. The parent uses all variables in output. we can't defer variable evaluation when consume + // in another function. + // 2. The output variables are not empty. If it's empty, we don't bother to do that. + // 3. We don't use row variable. The construction of row uses deferred variable evaluation. We + // can't do it. + // 4. The number of output variables must less than maximum number of parameters in Java method + // declaration. + val requireAllOutput = output.forall(parent.usedInputs.contains(_)) + val consumeFunc = + if (row == null && outputVars.nonEmpty && requireAllOutput && outputVars.length < 255) { + constructDoConsumeFunction(ctx, inputVars) + } else { + parent.doConsume(ctx, inputVars, rowVar) + } s""" |${ctx.registerComment(s"CONSUME: ${parent.simpleString}")} |$evaluated - |${parent.doConsume(ctx, inputVars, rowVar)} + |$consumeFunc + """.stripMargin + } + + /** + * To prevent concatenated function growing too long to be optimized by JIT. Instead of inlining, + * we may put the consume logic of parent operator into a function and set this flag to `true`. + * The parent operator can know if its consume logic is inlined or in separated function. + */ + private var doConsumeInFunc: Boolean = false + + /** + * Returning true means we have at least one consume logic from child operator or this operator is + * separated in a function. If this is `true`, this operator shouldn't use `continue` statement to + * continue on next row, because its generated codes aren't enclosed in main while-loop. + * + * For example, we have generated codes for a query plan like: + * Op1Exec + * Op2Exec + * Op3Exec + * + * If we put the consume code of Op2Exec into a separated function, the generated codes are like: + * while (...) { + * ... // logic of Op3Exec. + * Op2Exec_doConsume(...); + * } + * private boolean Op2Exec_doConsume(...) { + * ... // logic of Op2Exec to consume rows. + * } + * For now, `doConsumeInChainOfFunc` of Op2Exec will be `true`. + * + * Notice for some operators like `HashAggregateExec`, it doesn't chain previous consume functions + * but begins with its produce framework. We should override `doConsumeInChainOfFunc` to return + * `false`. + */ + protected def doConsumeInChainOfFunc: Boolean = { + val codegenChildren = children.map(_.asInstanceOf[CodegenSupport]) + doConsumeInFunc || codegenChildren.exists(_.doConsumeInChainOfFunc) + } + + /** + * The actual java statement this operator should use if there is a need to continue on next row + * in its `doConsume` codes. + * + * while (...) { + * ... // logic of Op3Exec. + * Op2Exec_doConsume(...); + * } + * private boolean Op2Exec_doConsume(...) { + * ... // logic of Op2Exec to consume rows. + * continue; // Wrong. We can't use continue with the while-loop. + * } + * In above code, we can't use `continue` in `Op2Exec_doConsume`. + * + * Instead, we do something like: + * while (...) { + * ... // logic of Op3Exec. + * boolean continueForLoop = Op2Exec_doConsume(...); + * if (continueForLoop) continue; + * } + * private boolean Op2Exec_doConsume(...) { + * ... // logic of Op2Exec to consume rows. + * return true; // When we need to do continue, we return true. + * } + */ + protected def continueStatementInDoConsume: String = if (doConsumeInChainOfFunc) { + "return true;"; + } else { + "continue;" + } + + /** + * To prevent concatenated function growing too long to be optimized by JIT. We can separate the + * parent's `doConsume` codes of a `CodegenSupport` operator into a function to call. + */ + protected def constructDoConsumeFunction( --- End diff -- `private`?
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org