Kimahriman commented on code in PR #34558:
URL: https://github.com/apache/spark/pull/34558#discussion_r3293465912


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala:
##########
@@ -622,6 +753,72 @@ case class ArrayFilter(
     new GenericArrayData(buffer)
   }
 
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    ctx.withLambdaVars(Seq(elementVar) ++ indexVar, varCodes => {
+      val elementCode = varCodes.head
+      val indexCode = varCodes.tail.headOption
+
+      nullSafeCodeGen(ctx, ev, arg => {
+        val numElements = ctx.freshName("numElements")
+        val count = ctx.freshName("count")
+        val arrayTracker = ctx.freshName("arrayTracker")
+        val arrayData = ctx.freshName("arrayData")
+        val i = ctx.freshName("i")
+        val j = ctx.freshName("j")
+
+        val arrayType = dataType.asInstanceOf[ArrayType]
+
+        val trackerInit = CodeGenerator.createArrayData(
+          arrayTracker, BooleanType, numElements, s" $prettyName failed.")
+        val resultInit = CodeGenerator.createArrayData(
+          arrayData, arrayType.elementType, count, s" $prettyName failed.")
+
+        val functionCode = function.genCode(ctx)
+
+        val elementAssignment = assignArrayElement(ctx, arg, elementCode, 
elementVar, i)
+        val indexAssignment = indexCode.map(c => assignIndex(ctx, c, 
indexVar.get, i))
+        val varAssignments = (Seq(elementAssignment) ++ 
indexAssignment).mkString("\n")
+
+        val resultAssignment = CodeGenerator.setArrayElement(arrayTracker, 
BooleanType,
+          i, functionCode.value, isNull = None)
+
+        val getTrackerValue = CodeGenerator.getValue(arrayTracker, 
BooleanType, i)
+        val copy = CodeGenerator.createArrayAssignment(arrayData, 
arrayType.elementType, arg,
+          j, i, arrayType.containsNull)
+
+        // This takes a two passes to avoid evaluating the predicate multiple 
times
+        // The first pass evaluates each element in the array, tracks how many 
elements
+        // returned true, and tracks the result of each element in a boolean 
array `arrayTracker`.
+        // The second pass copies elements from the original array to the new 
array created
+        // based on the number of elements matching the first pass.
+
+        s"""
+            |final int $numElements = $arg.numElements();
+            |$trackerInit
+            |int $count = 0;
+            |for (int $i = 0; $i < $numElements; $i++) {
+            |  $varAssignments
+            |  ${functionCode.code}
+            |  $resultAssignment
+            |  if ((boolean)${functionCode.value}) {

Review Comment:
   Fixed in `eecc672cb44`.
   
   Generated `ArrayFilter` now computes a local `keep` predicate as 
`!functionCode.isNull && functionCode.value` and uses that value for both the 
tracker array and the retained element count. I added a Catalyst regression 
with custom `GenericArrayData` that keeps `isNullAt=true` while exposing a 
stale typed boolean value, which fails against the old generated path.
   
   Verified with:
   
   ```bash
   build/sbt 'catalyst/testOnly 
org.apache.spark.sql.catalyst.expressions.HigherOrderFunctionsSuite -- -z 
"generated code"'
   ```
   
   Result: passed (`2` tests, `0` failures).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to