sunchao commented on code in PR #34558:
URL: https://github.com/apache/spark/pull/34558#discussion_r3293007181


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala:
##########
@@ -622,6 +753,72 @@ case class ArrayFilter(
     new GenericArrayData(buffer)
   }
 
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    ctx.withLambdaVars(Seq(elementVar) ++ indexVar, varCodes => {
+      val elementCode = varCodes.head
+      val indexCode = varCodes.tail.headOption
+
+      nullSafeCodeGen(ctx, ev, arg => {
+        val numElements = ctx.freshName("numElements")
+        val count = ctx.freshName("count")
+        val arrayTracker = ctx.freshName("arrayTracker")
+        val arrayData = ctx.freshName("arrayData")
+        val i = ctx.freshName("i")
+        val j = ctx.freshName("j")
+
+        val arrayType = dataType.asInstanceOf[ArrayType]
+
+        val trackerInit = CodeGenerator.createArrayData(
+          arrayTracker, BooleanType, numElements, s" $prettyName failed.")
+        val resultInit = CodeGenerator.createArrayData(
+          arrayData, arrayType.elementType, count, s" $prettyName failed.")
+
+        val functionCode = function.genCode(ctx)
+
+        val elementAssignment = assignArrayElement(ctx, arg, elementCode, 
elementVar, i)
+        val indexAssignment = indexCode.map(c => assignIndex(ctx, c, 
indexVar.get, i))
+        val varAssignments = (Seq(elementAssignment) ++ 
indexAssignment).mkString("\n")
+
+        val resultAssignment = CodeGenerator.setArrayElement(arrayTracker, 
BooleanType,
+          i, functionCode.value, isNull = None)
+
+        val getTrackerValue = CodeGenerator.getValue(arrayTracker, 
BooleanType, i)
+        val copy = CodeGenerator.createArrayAssignment(arrayData, 
arrayType.elementType, arg,
+          j, i, arrayType.containsNull)
+
+        // This takes a two passes to avoid evaluating the predicate multiple 
times
+        // The first pass evaluates each element in the array, tracks how many 
elements
+        // returned true, and tracks the result of each element in a boolean 
array `arrayTracker`.
+        // The second pass copies elements from the original array to the new 
array created
+        // based on the number of elements matching the first pass.
+
+        s"""
+            |final int $numElements = $arg.numElements();
+            |$trackerInit
+            |int $count = 0;
+            |for (int $i = 0; $i < $numElements; $i++) {
+            |  $varAssignments
+            |  ${functionCode.code}
+            |  $resultAssignment
+            |  if ((boolean)${functionCode.value}) {

Review Comment:
   [P1] Preserve null predicate semantics in generated `ArrayFilter`
   
   `filter` treats a null predicate result as false, but this generated loop 
reads `${functionCode.value}` without checking `${functionCode.isNull}`. I 
reproduced this on the current head with a vectorized Parquet array column 
containing `[true]` and `[null]`: with `CODEGEN_ONLY`, `filter(a, x -> x)` 
returns `[true]` and `[null]` instead of `[true]` and `[]`; with `NO_CODEGEN`, 
it returns the expected results. Please use a predicate equivalent to 
`!functionCode.isNull && functionCode.value` for both the tracker and count, 
and add a generated-code regression test.
   
   _\[ :robot: posted by Codex on behalf of sunchao using the 
[`code-review-for-me`](https://github.com/openai/openai/tree/master/skills/skills/code-review-for-me)
 skill :robot: \]_



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala:
##########
@@ -146,9 +146,13 @@ class EquivalentExpressions(
   // There are some special expressions that we should not recurse into all of 
its children.
   //   1. CodegenFallback: it's children will not be used to generate code 
(call eval() instead)
   //   2. ConditionalExpression: use its children that will always be 
evaluated.
+  //   3. HigherOrderFunction: lambda functions operate in the context of 
local lambdas and can't
+  //        be called outside of that scope, only the arguments can be 
evaluated ahead of
+  //        time.
   private def childrenToRecurse(expr: Expression): Seq[Expression] = expr 
match {
     case _: CodegenFallback => Nil
     case c: ConditionalExpression => 
c.alwaysEvaluatedInputs.map(skipForShortcut)
+    case h: HigherOrderFunction => h.arguments

Review Comment:
   [P1] Do not eagerly evaluate `ArrayAggregate.zero` for null input arrays
   
   For `ArrayAggregate`, `zero` is not evaluated when the array argument is 
null, but recursing into all `h.arguments` allows subexpression elimination to 
generate expressions from `zero` outside that null guard. On the current head, 
with ANSI mode, CSE, and generated code enabled, `aggregate(CAST(NULL AS 
ARRAY<INT>), (cast(id as int) / 0) + (cast(id as int) / 0), (acc, x) -> acc + 
x)` throws `DIVIDE_BY_ZERO`; with CSE disabled, the same generated query 
returns null. Please recurse only into the always-evaluated input array for 
`ArrayAggregate` and add this regression. This is separate from the empty-array 
nullable-accumulator fix in `9f330488952`.
   
   _\[ :robot: posted by Codex on behalf of sunchao using the 
[`code-review-for-me`](https://github.com/openai/openai/tree/master/skills/skills/code-review-for-me)
 skill :robot: \]_



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala:
##########
@@ -354,6 +442,49 @@ case class ArrayTransform(
     result
   }
 
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    ctx.withLambdaVars(Seq(elementVar) ++ indexVar, varCodes => {
+      val elementCode = varCodes.head
+      val indexCode = varCodes.tail.headOption
+
+      nullSafeCodeGen(ctx, ev, arg => {
+        val numElements = ctx.freshName("numElements")
+        val arrayData = ctx.freshName("arrayData")
+        val i = ctx.freshName("i")
+
+        val initialization = CodeGenerator.createArrayData(
+          arrayData, dataType.elementType, numElements, s" $prettyName 
failed.")
+
+        val functionCode = function.genCode(ctx)
+
+        val elementAssignment = assignArrayElement(ctx, arg, elementCode, 
elementVar, i)
+        val indexAssignment = indexCode.map(c => assignIndex(ctx, c, 
indexVar.get, i))
+        val varAssignments = (Seq(elementAssignment) ++ 
indexAssignment).mkString("\n")
+
+        // Some expressions return internal buffers that we have to copy
+        val copy = if (CodeGenerator.isPrimitiveType(function.dataType)) {
+          s"${functionCode.value}"
+        } else {
+          s"InternalRow.copyValue(${functionCode.value})"
+        }
+        val resultNull = if (function.nullable) 
Some(functionCode.isNull.toString) else None
+        val resultAssignment = CodeGenerator.setArrayElement(arrayData, 
dataType.elementType,

Review Comment:
   [P1] Preserve null complex elements in generated array HOF output
   
   This new output-writing path passes a null bit for nullable lambda results 
into `CodeGenerator.setArrayElement`, but that helper applies `setNullAt` only 
for primitive element types. I reproduced this on the current head with native 
vectorized ORC input containing `array(CAST(NULL AS ARRAY<INT>), array(1))`: 
the input reads back with `a[0] IS NULL = true`, and interpreted 
`transform`/`filter` return `[true, true]`, but generated `transform(a, x -> 
x)[0] IS NULL` and `filter(a, x -> x IS NULL)[0] IS NULL` return `[false, 
false]`. Please make nullable non-primitive array writes honor `isNull` and add 
a generated nested-null regression test.
   
   _\[ :robot: posted by Codex on behalf of sunchao using the 
[`code-review-for-me`](https://github.com/openai/openai/tree/master/skills/skills/code-review-for-me)
 skill :robot: \]_



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to