This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new fc6d0a4287 Fix shuffle function to report nullability correctly 
(#19184)
fc6d0a4287 is described below

commit fc6d0a4287ab8054d9f56c9ce0a13ffc0ec3c170
Author: harshit saini <[email protected]>
AuthorDate: Mon Dec 8 01:39:22 2025 +0530

    Fix shuffle function to report nullability correctly (#19184)
    
    # fix: shuffle should report nullability correctly
    
    - Replace return_type with return_field_from_args to preserve input
    nullability
    - Add test to verify nullability is correctly reported
    - Addresses issue #19145
    
    ## Which issue does this PR close?
    
    Closes #19145
    
    ## Rationale for this change
    
    The `shuffle` UDF was using the default `is_nullable` implementation
    which always returns `true`, regardless of the input array's
    nullability. This causes:
    1. Incorrect schema inference - non-nullable inputs are incorrectly
    marked as nullable
    2. Missed optimization opportunities - the query optimizer cannot apply
    certain optimizations when nullability information is incorrect
    3. Potential runtime errors - incorrect metadata can lead to unexpected
    behavior in downstream operations
    
    The shuffle function simply reorders elements within an array without
    changing the array's structure or nullability, so the output should have
    the same nullability as the input.
    
    ## What changes are included in this PR?
    
    1. **Implemented `return_field_from_args`**: Returns the input field
    directly, preserving both data type and nullability
    2. **Updated `return_type`**: Now returns an error directing users to
    use `return_field_from_args` instead (following DataFusion best
    practices)
    3. **Added comprehensive tests**: Verifies that both nullable and
    non-nullable inputs are handled correctly
    
    ## Are these changes tested?
    
    Yes, this PR includes a new test `test_shuffle_nullability` that
    verifies:
    - Non-nullable array input produces non-nullable output
    - Nullable array input produces nullable output
    - Data types are preserved correctly in both cases
    
    Test results:
---
 datafusion/spark/src/function/array/shuffle.rs | 64 ++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/datafusion/spark/src/function/array/shuffle.rs 
b/datafusion/spark/src/function/array/shuffle.rs
index 9f345b53b8..9fb37a4e78 100644
--- a/datafusion/spark/src/function/array/shuffle.rs
+++ b/datafusion/spark/src/function/array/shuffle.rs
@@ -26,7 +26,9 @@ use arrow::datatypes::FieldRef;
 use datafusion_common::cast::{
     as_fixed_size_list_array, as_large_list_array, as_list_array,
 };
-use datafusion_common::{exec_err, utils::take_function_args, Result, 
ScalarValue};
+use datafusion_common::{
+    exec_err, internal_err, utils::take_function_args, Result, ScalarValue,
+};
 use datafusion_expr::{
     ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, 
ScalarUDFImpl,
     Signature, TypeSignature, Volatility,
@@ -87,8 +89,16 @@ impl ScalarUDFImpl for SparkShuffle {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(arg_types[0].clone())
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(
+        &self,
+        args: datafusion_expr::ReturnFieldArgs,
+    ) -> Result<FieldRef> {
+        // Shuffle returns an array with the same type and nullability as the 
input
+        Ok(Arc::clone(&args.arg_fields[0]))
     }
 
     fn invoke_with_args(
@@ -263,3 +273,51 @@ fn fixed_size_array_shuffle(
         Some(nulls.into()),
     )?))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::Field;
+    use datafusion_expr::ReturnFieldArgs;
+
+    #[test]
+    fn test_shuffle_nullability() {
+        let shuffle = SparkShuffle::new();
+
+        // Test with non-nullable array
+        let non_nullable_field = Arc::new(Field::new(
+            "arr",
+            List(Arc::new(Field::new("item", DataType::Int32, true))),
+            false, // not nullable
+        ));
+
+        let result = shuffle
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&non_nullable_field)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        // The result should not be nullable (same as input)
+        assert!(!result.is_nullable());
+        assert_eq!(result.data_type(), non_nullable_field.data_type());
+
+        // Test with nullable array
+        let nullable_field = Arc::new(Field::new(
+            "arr",
+            List(Arc::new(Field::new("item", DataType::Int32, true))),
+            true, // nullable
+        ));
+
+        let result = shuffle
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&nullable_field)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        // The result should be nullable (same as input)
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), nullable_field.data_type());
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to