alamb commented on code in PR #18819:
URL: https://github.com/apache/datafusion/pull/18819#discussion_r2547219912


##########
datafusion/physical-expr/src/async_scalar_function.rs:
##########
@@ -192,10 +192,18 @@ impl AsyncFuncExpr {
             );
         }
 
-        let datas = ColumnarValue::values_to_arrays(&result_batches)?

Review Comment:
   Yeah, I agree with this assessment
   
   Your solution works and looks good to me. I played around with it locally 
and I think using the 
[concat](https://docs.rs/arrow/latest/arrow/compute/kernels/concat/index.html) 
kernel might be a little faster, for your consideration:
   
   Here is what works well for me locally
   
   
   ```diff
   diff --git a/datafusion/physical-expr/src/async_scalar_function.rs 
b/datafusion/physical-expr/src/async_scalar_function.rs
   index 1a794f411b..afb01f7b5e 100644
   --- a/datafusion/physical-expr/src/async_scalar_function.rs
   +++ b/datafusion/physical-expr/src/async_scalar_function.rs
   @@ -16,7 +16,8 @@
    // under the License.
   
    use crate::ScalarFunctionExpr;
   -use arrow::array::{make_array, MutableArrayData, RecordBatch};
   +use arrow::array::RecordBatch;
   +use arrow::compute::concat;
    use arrow::datatypes::{DataType, Field, FieldRef, Schema};
    use datafusion_common::config::ConfigOptions;
    use datafusion_common::Result;
   @@ -192,23 +193,22 @@ impl AsyncFuncExpr {
                );
            }
   
   +        // Concatenate all the arrays into a single array
            let datas = result_batches
   -            .iter()
   +            .into_iter()
                .map(|cv| match cv {
   -                ColumnarValue::Array(arr) => Ok(arr.to_data()),
   -                ColumnarValue::Scalar(scalar) => {
   -                    Ok(scalar.to_array_of_size(1)?.to_data())
   -                }
   +                ColumnarValue::Array(arr) => Ok(arr),
   +                ColumnarValue::Scalar(scalar) => 
Ok(scalar.to_array_of_size(1)?),
                })
                .collect::<Result<Vec<_>>>()?;
   
   -        let total_len = datas.iter().map(|d| d.len()).sum();
   -        let mut mutable = MutableArrayData::new(datas.iter().collect(), 
false, total_len);
   -        datas.iter().enumerate().for_each(|(i, data)| {
   -            mutable.extend(i, 0, data.len());
   -        });
   -        let array_ref = make_array(mutable.freeze());
   -        Ok(ColumnarValue::Array(array_ref))
   +        // Get references to the arrays as dyn Array to call concat
   +        let dyn_arrays = datas
   +            .iter()
   +            .map(|arr| arr as &dyn arrow::array::Array)
   +            .collect::<Vec<_>>();
   +        let result_array = concat(&dyn_arrays)?;
   +        Ok(ColumnarValue::Array(result_array))
        }
    }
   ```



##########
datafusion/core/tests/user_defined/user_defined_async_scalar_functions.rs:
##########
@@ -0,0 +1,139 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::{Int32Array, RecordBatch, StringArray};
+use arrow::datatypes::{DataType, Field, Schema};
+use async_trait::async_trait;
+use datafusion::prelude::*;
+use datafusion_common::{assert_batches_eq, Result};
+use datafusion_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+
+// This test checks the case where batch_size doesn't evenly divide
+// the number of rows.
+#[tokio::test]
+async fn test_async_udf_with_non_modular_batch_size() -> Result<()> {
+    let num_rows = 3;
+    let batch_size = 2;
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("prompt", DataType::Utf8, false),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int32Array::from((0..num_rows).collect::<Vec<i32>>())),
+            Arc::new(StringArray::from(
+                (0..num_rows)
+                    .map(|i| format!("prompt{i}"))
+                    .collect::<Vec<_>>(),
+            )),
+        ],
+    )?;
+
+    let ctx = SessionContext::new();
+    ctx.register_batch("test_table", batch)?;
+
+    ctx.register_udf(
+        AsyncScalarUDF::new(Arc::new(TestAsyncUDFImpl::new(batch_size)))
+            .into_scalar_udf(),
+    );
+
+    let df = ctx

Review Comment:
   I verified this test fails without the code fix in this PR
   
   ```
   ---- 
user_defined::user_defined_async_scalar_functions::test_async_udf_with_non_modular_batch_size
 stdout ----
   Error: Internal("Arguments has mixed length. Expected length: 2, found 
length: 1")
   
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to